Skip to content

Commit a3f6d7d

Browse files
zfergus and Copilot authored
Add SIMD support via xsimd library (#207)
* Add optional SIMD support via xsimd
  - Add cross-platform SIMD in LBVH using the xsimd library
* Generalize SIMD traversal to variable width
* Fix issue with abs(double) casting to int
* Remove SIMD preset from CMake configuration and update lbvh.cpp buffer initialization
* Use EIGEN_USING_STD for std::abs to avoid casting issues
* Apply suggestions from code review

  Co-authored-by: Copilot <[email protected]>
* Refactor comments for clarity in SIMD query intersection logic

---------

Co-authored-by: Copilot <[email protected]>
1 parent a9ece21 commit a3f6d7d

File tree

7 files changed

+103
-56
lines changed

7 files changed

+103
-56
lines changed

CMakeLists.txt

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ else()
7575
endif()
7676

7777
option(IPC_TOOLKIT_WITH_CUDA "Enable CUDA CCD" OFF)
78+
option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" ON)
7879
option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF)
7980
option(IPC_TOOLKIT_WITH_ROBIN_MAP "Use Tessil's robin-map rather than std maps" ON)
8081
option(IPC_TOOLKIT_WITH_ABSEIL "Use Abseil's hash functions" ON)
@@ -83,10 +84,8 @@ option(IPC_TOOLKIT_WITH_INEXACT_CCD "Use the original inexact CCD meth
8384
option(IPC_TOOLKIT_WITH_PROFILER "Enable performance profiler" OFF)
8485

8586
# Advanced options
86-
option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" OFF)
8787
option(IPC_TOOLKIT_WITH_CODE_COVERAGE "Enable coverage reporting" OFF)
8888

89-
mark_as_advanced(IPC_TOOLKIT_WITH_SIMD) # This does not work reliably
9089
mark_as_advanced(IPC_TOOLKIT_WITH_CODE_COVERAGE) # This is used in GitHub Actions
9190

9291
# Set default minimum C++ standard
@@ -112,9 +111,10 @@ include(ipc_toolkit_use_colors)
112111
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
113112

114113
################################################################################
115-
# CUDA
114+
# Verify Options
116115
################################################################################
117116

117+
# CUDA support
118118
if(IPC_TOOLKIT_WITH_CUDA)
119119
# If CMAKE_CUDA_ARCHITECTURES was not specified, set it to native.
120120
if(DEFINED CMAKE_CUDA_ARCHITECTURES)
@@ -129,6 +129,19 @@ if(IPC_TOOLKIT_WITH_CUDA)
129129
enable_language(CUDA)
130130
endif()
131131

132+
## SIMD support
133+
if(IPC_TOOLKIT_WITH_SIMD)
134+
# Figure out SIMD support
135+
message(STATUS "Testing SIMD capabilities...")
136+
find_package(SIMD)
137+
if (SIMD_CXX_FLAGS)
138+
message(STATUS "SIMD support found: ${SIMD_CXX_FLAGS}")
139+
else()
140+
message(WARNING "SIMD support requested but not found. Continuing without SIMD.")
141+
set(IPC_TOOLKIT_WITH_SIMD OFF CACHE BOOL "Enable SIMD" FORCE)
142+
endif()
143+
endif()
144+
132145
################################################################################
133146
# IPC Toolkit Library
134147
################################################################################
@@ -247,14 +260,15 @@ target_link_libraries(ipc_toolkit PRIVATE ipc::toolkit::warnings)
247260

248261
## SIMD support
249262
if(IPC_TOOLKIT_WITH_SIMD)
250-
# Figure out SIMD support
251-
message(STATUS "Testing SIMD capabilities...")
252-
find_package(SIMD)
253263
# Add SIMD flags to compiler flags
254-
message(STATUS "Using SIMD flags: ${SIMD_FLAGS}")
255-
target_compile_options(ipc_toolkit PRIVATE ${SIMD_FLAGS})
256-
else()
257-
message(STATUS "SIMD support disabled")
264+
target_compile_options(ipc_toolkit PRIVATE ${SIMD_CXX_FLAGS})
265+
266+
# Link against cross-platform xsimd library
267+
include(xsimd)
268+
target_link_libraries(ipc_toolkit PRIVATE xsimd::xsimd)
269+
270+
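# Disable Eigen's vectorization because it has been found to cause alignment issues.
# NOTE: this is an INTERFACE definition on Eigen3_Eigen, so it propagates to every target that links against it.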
271+
target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE=1)
258272
endif()
259273

260274
# For MSVC, do not use the min and max macros.

CMakePresets.json

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,6 @@
5151
"IPC_TOOLKIT_WITH_CUDA": "ON"
5252
}
5353
},
54-
{
55-
"name": "simd",
56-
"inherits": "release",
57-
"displayName": "SIMD Enabled",
58-
"description": "Build with SIMD optimizations",
59-
"binaryDir": "${sourceDir}/build/simd",
60-
"cacheVariables": {
61-
"IPC_TOOLKIT_WITH_SIMD": "ON"
62-
}
63-
},
6454
{
6555
"name": "test",
6656
"inherits": "debug",
@@ -82,7 +72,6 @@
8272
"cacheVariables": {
8373
"IPC_TOOLKIT_BUILD_PYTHON": "ON",
8474
"IPC_TOOLKIT_BUILD_TESTS": "OFF",
85-
"IPC_TOOLKIT_WITH_SIMD": "OFF",
8675
"IPC_TOOLKIT_WITH_CUDA": "OFF"
8776
}
8877
},
@@ -166,4 +155,4 @@
166155
}
167156
}
168157
]
169-
}
158+
}

IPCToolkitOptions.cmake.sample

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,12 @@
3131
# option(IPC_TOOLKIT_BUILD_TESTS "Build unit-tests" ON)
3232
# option(IPC_TOOLKIT_BUILD_PYTHON "Build Python bindings" OFF)
3333
# option(IPC_TOOLKIT_WITH_CUDA "Enable CUDA CCD" OFF)
34+
# option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" ON)
3435
# option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF)
3536
# option(IPC_TOOLKIT_WITH_ROBIN_MAP "Use Tessil's robin-map rather than std maps" ON)
3637
# option(IPC_TOOLKIT_WITH_ABSEIL "Use Abseil's hash functions" ON)
3738
# option(IPC_TOOLKIT_WITH_FILIB "Use filib for interval arithmetic" ON)
3839
# option(IPC_TOOLKIT_WITH_INEXACT_CCD "Use the original inexact CCD method of IPC" OFF)
39-
# option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" OFF)
4040
# option(IPC_TOOLKIT_WITH_CODE_COVERAGE "Enable coverage reporting" OFF)
4141
# option(IPC_TOOLKIT_TESTS_CCD_BENCHMARK "Enable CCD benchmark test" ON)
4242
# set(IPC_TOOLKIT_TESTS_CCD_BENCHMARK_DIR "" CACHE PATH "Path to the CCD benchmark directory")

cmake/recipes/xsimd.cmake

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# xsimd (https://github.com/xtensor-stack/xsimd)
2+
# License: BSD-3-Clause
3+
if(TARGET xsimd::xsimd)
4+
return()
5+
endif()
6+
7+
message(STATUS "Third-party: creating target 'xsimd::xsimd'")
8+
9+
include(CPM)
10+
CPMAddPackage("gh:xtensor-stack/xsimd#14.0.0")
11+
12+
add_library(xsimd::xsimd ALIAS xsimd)
13+
14+
# Folder name for IDE
15+
set_target_properties(xsimd PROPERTIES FOLDER "ThirdParty")

src/ipc/broad_phase/lbvh.cpp

Lines changed: 54 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,16 @@
99
#include <tbb/parallel_for.h>
1010
#include <tbb/parallel_sort.h>
1111

12-
#ifdef __APPLE__
13-
// We utilize SIMD registers to compare 1 Node against 4 Queries simultaneously.
14-
#include <simd/simd.h>
12+
#ifdef IPC_TOOLKIT_WITH_SIMD
13+
// We utilize SIMD registers to compare one node against multiple queries
14+
// simultaneously, with the number of queries determined by
15+
// xs::batch<float>::size.
16+
#include <xsimd/xsimd.hpp>
17+
namespace xs = xsimd;
1518
#endif
1619

20+
#include <array>
21+
1722
using namespace std::placeholders;
1823

1924
namespace ipc {
@@ -448,9 +453,9 @@ namespace {
448453
} while (node_idx != LBVH::Node::INVALID_POINTER); // Same as root
449454
}
450455

451-
#ifdef __APPLE__
456+
#ifdef IPC_TOOLKIT_WITH_SIMD
452457
// SIMD Traversal
453-
// Traverses 4 queries simultaneously using SIMD.
458+
// Traverses multiple queries simultaneously using SIMD.
454459
template <typename Candidate, bool swap_order, bool triangular>
455460
void traverse_lbvh_simd(
456461
const LBVH::Node* queries,
@@ -459,28 +464,37 @@ namespace {
459464
const std::function<bool(size_t, size_t)>& can_collide,
460465
std::vector<Candidate>& candidates)
461466
{
462-
assert(n_queries >= 1 && n_queries <= 4);
463-
// Load 4 queries into single registers (Structure of Arrays)
464-
auto make_simd = [&](auto F) -> simd_float4 {
465-
return simd_float4 {
466-
F(0),
467-
n_queries > 1 ? F(1) : 0.0f,
468-
n_queries > 2 ? F(2) : 0.0f,
469-
n_queries > 3 ? F(3) : 0.0f,
470-
};
467+
using batch_t = xs::batch<float>;
468+
assert(n_queries >= 1 && n_queries <= batch_t::size);
469+
470+
// Load queries into single registers
471+
auto make_simd = [&](auto F) -> batch_t {
472+
// 1. Create a buffer of the correct architecture-dependent size
473+
alignas(xs::default_arch::alignment())
474+
std::array<float, batch_t::size>
475+
buffer {};
476+
477+
#pragma unroll
478+
// 2. Fill the buffer, respecting the actual number of queries
479+
for (size_t i = 0; i < batch_t::size; ++i) {
480+
buffer[i] = (i < n_queries) ? F(static_cast<int>(i)) : 0.0f;
481+
}
482+
483+
// 3. Load the buffer into the SIMD register
484+
return batch_t::load_aligned(buffer.data());
471485
};
472486

473-
const simd_float4 q_min_x =
487+
const auto q_min_x =
474488
make_simd([&](int k) { return queries[k].aabb_min.x(); });
475-
const simd_float4 q_min_y =
489+
const auto q_min_y =
476490
make_simd([&](int k) { return queries[k].aabb_min.y(); });
477-
const simd_float4 q_min_z =
491+
const auto q_min_z =
478492
make_simd([&](int k) { return queries[k].aabb_min.z(); });
479-
const simd_float4 q_max_x =
493+
const auto q_max_x =
480494
make_simd([&](int k) { return queries[k].aabb_max.x(); });
481-
const simd_float4 q_max_y =
495+
const auto q_max_y =
482496
make_simd([&](int k) { return queries[k].aabb_max.y(); });
483-
const simd_float4 q_max_z =
497+
const auto q_max_z =
484498
make_simd([&](int k) { return queries[k].aabb_max.z(); });
485499

486500
// Use a fixed-size array as a stack to avoid dynamic allocations
@@ -505,31 +519,33 @@ namespace {
505519
const LBVH::Node& child_l = lbvh[node.left];
506520
const LBVH::Node& child_r = lbvh[node.right];
507521

508-
// 1. Intersect 4 queries at once
522+
// 1. Intersect multiple queries at once
509523
// (child_l.min <= query.max) && (query.min <= child_l.max)
510-
const simd_int4 intersects_l = (child_l.aabb_min.x() <= q_max_x)
524+
const xs::batch_bool<float> intersects_l =
525+
(child_l.aabb_min.x() <= q_max_x)
511526
& (child_l.aabb_min.y() <= q_max_y)
512527
& (child_l.aabb_min.z() <= q_max_z)
513528
& (q_min_x <= child_l.aabb_max.x())
514529
& (q_min_y <= child_l.aabb_max.y())
515530
& (q_min_z <= child_l.aabb_max.z());
516531

517-
// 2. Intersect 4 queries at once
532+
// 2. Intersect multiple queries at once
518533
// (child_r.min <= query.max) && (query.min <= child_r.max)
519-
const simd_int4 intersects_r = (child_r.aabb_min.x() <= q_max_x)
534+
const xs::batch_bool<float> intersects_r =
535+
(child_r.aabb_min.x() <= q_max_x)
520536
& (child_r.aabb_min.y() <= q_max_y)
521537
& (child_r.aabb_min.z() <= q_max_z)
522538
& (q_min_x <= child_r.aabb_max.x())
523539
& (q_min_y <= child_r.aabb_max.y())
524540
& (q_min_z <= child_r.aabb_max.z());
525541

526-
const bool any_intersects_l = simd_any(intersects_l);
527-
const bool any_intersects_r = simd_any(intersects_r);
542+
const bool any_intersects_l = xs::any(intersects_l);
543+
const bool any_intersects_r = xs::any(intersects_r);
528544

529545
// Query overlaps a leaf node => report collision
530546
if (any_intersects_l && child_l.is_leaf()) {
531547
for (int k = 0; k < n_queries; ++k) {
532-
if (intersects_l[k]) {
548+
if (intersects_l.get(k)) {
533549
attempt_add_candidate<
534550
Candidate, swap_order, triangular>(
535551
queries[k], child_l, can_collide, candidates);
@@ -538,7 +554,7 @@ namespace {
538554
}
539555
if (any_intersects_r && child_r.is_leaf()) {
540556
for (int k = 0; k < n_queries; ++k) {
541-
if (intersects_r[k]) {
557+
if (intersects_r.get(k)) {
542558
attempt_add_candidate<
543559
Candidate, swap_order, triangular>(
544560
queries[k], child_r, can_collide, candidates);
@@ -576,9 +592,12 @@ namespace {
576592
const std::function<bool(size_t, size_t)>& can_collide,
577593
tbb::enumerable_thread_specific<std::vector<Candidate>>& storage)
578594
{
579-
#ifdef __APPLE__ // Only support SIMD on Apple platforms for now
580-
constexpr size_t SIMD_SIZE = use_simd ? 4 : 1;
581-
constexpr size_t GRAIN_SIZE = use_simd ? 16 : 1;
595+
#ifdef IPC_TOOLKIT_WITH_SIMD // Enable SIMD acceleration when available
596+
constexpr size_t SIMD_SIZE = use_simd ? xs::batch<float>::size : 1;
597+
static_assert(
598+
64 % xs::batch<float>::size == 0, "GRAIN_SIZE must be an integer");
599+
constexpr size_t GRAIN_SIZE =
600+
use_simd ? (64 / xs::batch<float>::size) : 1;
582601
#else
583602
constexpr size_t SIMD_SIZE = 1;
584603
constexpr size_t GRAIN_SIZE = 1;
@@ -595,11 +614,13 @@ namespace {
595614
tbb::blocked_range<size_t>(size_t(0), n_tasks, GRAIN_SIZE),
596615
[&](const tbb::blocked_range<size_t>& r) {
597616
auto& local_candidates = storage.local();
617+
#ifdef IPC_TOOLKIT_WITH_SIMD
598618
const size_t actual_end = // Handle tail case
599619
std::min(SIMD_SIZE * r.end(), n_source_leaves);
620+
#endif
600621
for (size_t i = r.begin(); i < r.end(); ++i) {
601622
const size_t idx = SIMD_SIZE * i;
602-
#ifdef __APPLE__
623+
#ifdef IPC_TOOLKIT_WITH_SIMD
603624
if constexpr (use_simd) {
604625
assert(actual_end - idx >= 1);
605626
traverse_lbvh_simd<Candidate, swap_order, triangular>(
@@ -611,7 +632,7 @@ namespace {
611632
traverse_lbvh<Candidate, swap_order, triangular>(
612633
source[source_leaf_offset + idx], target,
613634
can_collide, local_candidates);
614-
#ifdef __APPLE__
635+
#ifdef IPC_TOOLKIT_WITH_SIMD
615636
}
616637
#endif
617638
}

src/ipc/config.hpp.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#cmakedefine IPC_TOOLKIT_WITH_INEXACT_CCD
1414
#cmakedefine IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION
1515
#cmakedefine IPC_TOOLKIT_WITH_CUDA
16+
#cmakedefine IPC_TOOLKIT_WITH_SIMD
1617
#cmakedefine IPC_TOOLKIT_WITH_ROBIN_MAP
1718
#cmakedefine IPC_TOOLKIT_WITH_ABSEIL
1819
#cmakedefine IPC_TOOLKIT_WITH_FILIB

src/ipc/utils/eigen_ext.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@
55

66
#include <cassert>
77

8+
#ifdef EIGEN_DONT_VECTORIZE
9+
// NOTE: Avoid error about abs casting double to int. Eigen does this
10+
// internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined.
11+
// TODO: We should always use std::abs to avoid this issue.
12+
EIGEN_USING_STD(abs); // using std::abs;
13+
#endif
14+
815
namespace Eigen {
916
template <typename T> using RowRef = Ref<T, 0, Eigen::InnerStride<>>;
1017
template <typename T> using ConstRef = const Ref<const T>&;

0 commit comments

Comments
 (0)