Add SIMD support via xsimd library (#207)

zfergus · Copilot · web-flow · commit a3f6d7dada10 · 2026-01-26T13:47:22.000-05:00
* Add optional SIMD support via xsimd

- Add cross-platform SIMD in LBVH using the xsimd library

* Generalize SIMD traversal to variable width

* Fix issue with abs(double) casting to int

* Remove SIMD preset from CMake configuration and update lbvh.cpp buffer initialization

* Use EIGEN_USING_STD for std::abs to avoid casting issues

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Refactor comments for clarity in SIMD query intersection logic

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -75,6 +75,7 @@ else()
 endif()
 
 option(IPC_TOOLKIT_WITH_CUDA                  "Enable CUDA CCD"                               OFF)
+option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                    ON)
 option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF)
 option(IPC_TOOLKIT_WITH_ROBIN_MAP             "Use Tessil's robin-map rather than std maps"    ON)
 option(IPC_TOOLKIT_WITH_ABSEIL                "Use Abseil's hash functions"                    ON)
@@ -83,10 +84,8 @@ option(IPC_TOOLKIT_WITH_INEXACT_CCD           "Use the original inexact CCD meth
 option(IPC_TOOLKIT_WITH_PROFILER              "Enable performance profiler"                   OFF)
 
 # Advanced options
-option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                   OFF)
 option(IPC_TOOLKIT_WITH_CODE_COVERAGE         "Enable coverage reporting"                     OFF)
 
-mark_as_advanced(IPC_TOOLKIT_WITH_SIMD)          # This does not work reliably
 mark_as_advanced(IPC_TOOLKIT_WITH_CODE_COVERAGE) # This is used in GitHub Actions
 
 # Set default minimum C++ standard
@@ -112,9 +111,10 @@ include(ipc_toolkit_use_colors)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 ################################################################################
-# CUDA
+# Verify Options
 ################################################################################
 
+# CUDA support
 if(IPC_TOOLKIT_WITH_CUDA)
   # If CMAKE_CUDA_ARCHITECTURES was not specified, set it to native.
   if(DEFINED CMAKE_CUDA_ARCHITECTURES)
@@ -129,6 +129,19 @@ if(IPC_TOOLKIT_WITH_CUDA)
   enable_language(CUDA)
 endif()
 
+## SIMD support
+if(IPC_TOOLKIT_WITH_SIMD)
+  # Figure out SIMD support
+  message(STATUS "Testing SIMD capabilities...")
+  find_package(SIMD)
+  if (SIMD_CXX_FLAGS)
+    message(STATUS "SIMD support found: ${SIMD_CXX_FLAGS}")
+  else()
+    message(WARNING "SIMD support requested but not found. Continuing without SIMD.")
+    set(IPC_TOOLKIT_WITH_SIMD OFF CACHE BOOL "Enable SIMD" FORCE)
+  endif()
+endif()
+
 ################################################################################
 # IPC Toolkit Library
 ################################################################################
@@ -247,14 +260,15 @@ target_link_libraries(ipc_toolkit PRIVATE ipc::toolkit::warnings)
 
 ## SIMD support
 if(IPC_TOOLKIT_WITH_SIMD)
-  # Figure out SIMD support
-  message(STATUS "Testing SIMD capabilities...")
-  find_package(SIMD)
   # Add SIMD flags to compiler flags
-  message(STATUS "Using SIMD flags: ${SIMD_FLAGS}")
-  target_compile_options(ipc_toolkit PRIVATE ${SIMD_FLAGS})
-else()
-  message(STATUS "SIMD support disabled")
+  target_compile_options(ipc_toolkit PRIVATE ${SIMD_CXX_FLAGS})
+
+  # Link against cross-platform xsimd library
+  include(xsimd)
+  target_link_libraries(ipc_toolkit PRIVATE xsimd::xsimd)
+
+  # Disable vectorization in Eigen since I've found it to have alignment issues.
+  target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE=1)
 endif()
 
 # For MSVC, do not use the min and max macros.
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -51,16 +51,6 @@
         "IPC_TOOLKIT_WITH_CUDA": "ON"
       }
     },
-    {
-      "name": "simd",
-      "inherits": "release",
-      "displayName": "SIMD Enabled",
-      "description": "Build with SIMD optimizations",
-      "binaryDir": "${sourceDir}/build/simd",
-      "cacheVariables": {
-        "IPC_TOOLKIT_WITH_SIMD": "ON"
-      }
-    },
     {
       "name": "test",
       "inherits": "debug",
@@ -82,7 +72,6 @@
       "cacheVariables": {
         "IPC_TOOLKIT_BUILD_PYTHON": "ON",
         "IPC_TOOLKIT_BUILD_TESTS": "OFF",
-        "IPC_TOOLKIT_WITH_SIMD": "OFF",
         "IPC_TOOLKIT_WITH_CUDA": "OFF"
       }
     },
@@ -166,4 +155,4 @@
       }
     }
   ]
-}
+}
diff --git a/IPCToolkitOptions.cmake.sample b/IPCToolkitOptions.cmake.sample
@@ -31,12 +31,12 @@
 # option(IPC_TOOLKIT_BUILD_TESTS                "Build unit-tests"                               ON)
 # option(IPC_TOOLKIT_BUILD_PYTHON               "Build Python bindings"                         OFF)
 # option(IPC_TOOLKIT_WITH_CUDA                  "Enable CUDA CCD"                               OFF)
+# option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                    ON)
 # option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF)
 # option(IPC_TOOLKIT_WITH_ROBIN_MAP             "Use Tessil's robin-map rather than std maps"    ON)
 # option(IPC_TOOLKIT_WITH_ABSEIL                "Use Abseil's hash functions"                    ON)
 # option(IPC_TOOLKIT_WITH_FILIB                 "Use filib for interval arithmetic"              ON)
 # option(IPC_TOOLKIT_WITH_INEXACT_CCD           "Use the original inexact CCD method of IPC"    OFF)
-# option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                   OFF)
 # option(IPC_TOOLKIT_WITH_CODE_COVERAGE         "Enable coverage reporting"                     OFF)
 # option(IPC_TOOLKIT_TESTS_CCD_BENCHMARK        "Enable CCD benchmark test"                      ON)
 # set(IPC_TOOLKIT_TESTS_CCD_BENCHMARK_DIR     "" CACHE PATH "Path to the CCD benchmark directory")
diff --git a/cmake/recipes/xsimd.cmake b/cmake/recipes/xsimd.cmake
@@ -0,0 +1,15 @@
+# xsimd (https://github.com/xtensor-stack/xsimd)
+# License: BSD-3-Clause
+if(TARGET xsimd::xsimd)
+  return()
+endif()
+
+message(STATUS "Third-party: creating target 'xsimd::xsimd'")
+
+include(CPM)
+CPMAddPackage("gh:xtensor-stack/xsimd#14.0.0")
+
+add_library(xsimd::xsimd ALIAS xsimd)
+
+# Folder name for IDE
+set_target_properties(xsimd PROPERTIES FOLDER "ThirdParty")
diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp
@@ -9,11 +9,16 @@
 #include <tbb/parallel_for.h>
 #include <tbb/parallel_sort.h>
 
-#ifdef __APPLE__
-// We utilize SIMD registers to compare 1 Node against 4 Queries simultaneously.
-#include <simd/simd.h>
+#ifdef IPC_TOOLKIT_WITH_SIMD
+// We utilize SIMD registers to compare one node against multiple queries
+// simultaneously, with the number of queries determined by
+// xs::batch<float>::size.
+#include <xsimd/xsimd.hpp>
+namespace xs = xsimd;
 #endif
 
+#include <array>
+
 using namespace std::placeholders;
 
 namespace ipc {
@@ -448,9 +453,9 @@ namespace {
         } while (node_idx != LBVH::Node::INVALID_POINTER); // Same as root
     }
 
-#ifdef __APPLE__
+#ifdef IPC_TOOLKIT_WITH_SIMD
     // SIMD Traversal
-    // Traverses 4 queries simultaneously using SIMD.
+    // Traverses multiple queries simultaneously using SIMD.
     template <typename Candidate, bool swap_order, bool triangular>
     void traverse_lbvh_simd(
         const LBVH::Node* queries,
@@ -459,28 +464,37 @@ namespace {
         const std::function<bool(size_t, size_t)>& can_collide,
         std::vector<Candidate>& candidates)
     {
-        assert(n_queries >= 1 && n_queries <= 4);
-        // Load 4 queries into single registers (Structure of Arrays)
-        auto make_simd = [&](auto F) -> simd_float4 {
-            return simd_float4 {
-                F(0),
-                n_queries > 1 ? F(1) : 0.0f,
-                n_queries > 2 ? F(2) : 0.0f,
-                n_queries > 3 ? F(3) : 0.0f,
-            };
+        using batch_t = xs::batch<float>;
+        assert(n_queries >= 1 && n_queries <= batch_t::size);
+
+        // Load queries into single registers
+        auto make_simd = [&](auto F) -> batch_t {
+            // 1. Create a buffer of the correct architecture-dependent size
+            alignas(xs::default_arch::alignment())
+                std::array<float, batch_t::size>
+                    buffer {};
+
+#pragma unroll
+            // 2. Fill the buffer, respecting the actual number of queries
+            for (size_t i = 0; i < batch_t::size; ++i) {
+                buffer[i] = (i < n_queries) ? F(static_cast<int>(i)) : 0.0f;
+            }
+
+            // 3. Load the buffer into the SIMD register
+            return batch_t::load_aligned(buffer.data());
         };
 
-        const simd_float4 q_min_x =
+        const auto q_min_x =
             make_simd([&](int k) { return queries[k].aabb_min.x(); });
-        const simd_float4 q_min_y =
+        const auto q_min_y =
             make_simd([&](int k) { return queries[k].aabb_min.y(); });
-        const simd_float4 q_min_z =
+        const auto q_min_z =
             make_simd([&](int k) { return queries[k].aabb_min.z(); });
-        const simd_float4 q_max_x =
+        const auto q_max_x =
             make_simd([&](int k) { return queries[k].aabb_max.x(); });
-        const simd_float4 q_max_y =
+        const auto q_max_y =
             make_simd([&](int k) { return queries[k].aabb_max.y(); });
-        const simd_float4 q_max_z =
+        const auto q_max_z =
             make_simd([&](int k) { return queries[k].aabb_max.z(); });
 
         // Use a fixed-size array as a stack to avoid dynamic allocations
@@ -505,31 +519,33 @@ namespace {
             const LBVH::Node& child_l = lbvh[node.left];
             const LBVH::Node& child_r = lbvh[node.right];
 
-            // 1. Intersect 4 queries at once
+            // 1. Intersect multiple queries at once
             // (child_l.min <= query.max) && (query.min <= child_l.max)
-            const simd_int4 intersects_l = (child_l.aabb_min.x() <= q_max_x)
+            const xs::batch_bool<float> intersects_l =
+                (child_l.aabb_min.x() <= q_max_x)
                 & (child_l.aabb_min.y() <= q_max_y)
                 & (child_l.aabb_min.z() <= q_max_z)
                 & (q_min_x <= child_l.aabb_max.x())
                 & (q_min_y <= child_l.aabb_max.y())
                 & (q_min_z <= child_l.aabb_max.z());
 
-            // 2. Intersect 4 queries at once
+            // 2. Intersect multiple queries at once
             // (child_r.min <= query.max) && (query.min <= child_r.max)
-            const simd_int4 intersects_r = (child_r.aabb_min.x() <= q_max_x)
+            const xs::batch_bool<float> intersects_r =
+                (child_r.aabb_min.x() <= q_max_x)
                 & (child_r.aabb_min.y() <= q_max_y)
                 & (child_r.aabb_min.z() <= q_max_z)
                 & (q_min_x <= child_r.aabb_max.x())
                 & (q_min_y <= child_r.aabb_max.y())
                 & (q_min_z <= child_r.aabb_max.z());
 
-            const bool any_intersects_l = simd_any(intersects_l);
-            const bool any_intersects_r = simd_any(intersects_r);
+            const bool any_intersects_l = xs::any(intersects_l);
+            const bool any_intersects_r = xs::any(intersects_r);
 
             // Query overlaps a leaf node => report collision
             if (any_intersects_l && child_l.is_leaf()) {
                 for (int k = 0; k < n_queries; ++k) {
-                    if (intersects_l[k]) {
+                    if (intersects_l.get(k)) {
                         attempt_add_candidate<
                             Candidate, swap_order, triangular>(
                             queries[k], child_l, can_collide, candidates);
@@ -538,7 +554,7 @@ namespace {
             }
             if (any_intersects_r && child_r.is_leaf()) {
                 for (int k = 0; k < n_queries; ++k) {
-                    if (intersects_r[k]) {
+                    if (intersects_r.get(k)) {
                         attempt_add_candidate<
                             Candidate, swap_order, triangular>(
                             queries[k], child_r, can_collide, candidates);
@@ -576,9 +592,12 @@ namespace {
         const std::function<bool(size_t, size_t)>& can_collide,
         tbb::enumerable_thread_specific<std::vector<Candidate>>& storage)
     {
-#ifdef __APPLE__ // Only support SIMD on Apple platforms for now
-        constexpr size_t SIMD_SIZE = use_simd ? 4 : 1;
-        constexpr size_t GRAIN_SIZE = use_simd ? 16 : 1;
+#ifdef IPC_TOOLKIT_WITH_SIMD // Enable SIMD acceleration when available
+        constexpr size_t SIMD_SIZE = use_simd ? xs::batch<float>::size : 1;
+        static_assert(
+            64 % xs::batch<float>::size == 0, "GRAIN_SIZE must be an integer");
+        constexpr size_t GRAIN_SIZE =
+            use_simd ? (64 / xs::batch<float>::size) : 1;
 #else
         constexpr size_t SIMD_SIZE = 1;
         constexpr size_t GRAIN_SIZE = 1;
@@ -595,11 +614,13 @@ namespace {
             tbb::blocked_range<size_t>(size_t(0), n_tasks, GRAIN_SIZE),
             [&](const tbb::blocked_range<size_t>& r) {
                 auto& local_candidates = storage.local();
+#ifdef IPC_TOOLKIT_WITH_SIMD
                 const size_t actual_end = // Handle tail case
                     std::min(SIMD_SIZE * r.end(), n_source_leaves);
+#endif
                 for (size_t i = r.begin(); i < r.end(); ++i) {
                     const size_t idx = SIMD_SIZE * i;
-#ifdef __APPLE__
+#ifdef IPC_TOOLKIT_WITH_SIMD
                     if constexpr (use_simd) {
                         assert(actual_end - idx >= 1);
                         traverse_lbvh_simd<Candidate, swap_order, triangular>(
@@ -611,7 +632,7 @@ namespace {
                         traverse_lbvh<Candidate, swap_order, triangular>(
                             source[source_leaf_offset + idx], target,
                             can_collide, local_candidates);
-#ifdef __APPLE__
+#ifdef IPC_TOOLKIT_WITH_SIMD
                     }
 #endif
                 }
diff --git a/src/ipc/config.hpp.in b/src/ipc/config.hpp.in
@@ -13,6 +13,7 @@
 #cmakedefine IPC_TOOLKIT_WITH_INEXACT_CCD
 #cmakedefine IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION
 #cmakedefine IPC_TOOLKIT_WITH_CUDA
+#cmakedefine IPC_TOOLKIT_WITH_SIMD
 #cmakedefine IPC_TOOLKIT_WITH_ROBIN_MAP
 #cmakedefine IPC_TOOLKIT_WITH_ABSEIL
 #cmakedefine IPC_TOOLKIT_WITH_FILIB
diff --git a/src/ipc/utils/eigen_ext.hpp b/src/ipc/utils/eigen_ext.hpp
@@ -5,6 +5,13 @@
 
 #include <cassert>
 
+#ifdef EIGEN_DONT_VECTORIZE
+// NOTE: Avoid error about abs casting double to int. Eigen does this
+// internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined.
+// TODO: We should always use std::abs to avoid this issue.
+EIGEN_USING_STD(abs); // using std::abs;
+#endif
+
 namespace Eigen {
 template <typename T> using RowRef = Ref<T, 0, Eigen::InnerStride<>>;
 template <typename T> using ConstRef = const Ref<const T>&;

Original file line number	Diff line number	Diff line change
`@@ -51,16 +51,6 @@`
`51`	`51`	`"IPC_TOOLKIT_WITH_CUDA": "ON"`
`52`	`52`	`}`
`53`	`53`	`},`
`54`		`- {`
`55`		`- "name": "simd",`
`56`		`- "inherits": "release",`
`57`		`- "displayName": "SIMD Enabled",`
`58`		`- "description": "Build with SIMD optimizations",`
`59`		`- "binaryDir": "${sourceDir}/build/simd",`
`60`		`- "cacheVariables": {`
`61`		`- "IPC_TOOLKIT_WITH_SIMD": "ON"`
`62`		`- }`
`63`		`- },`
`64`	`54`	`{`
`65`	`55`	`"name": "test",`
`66`	`56`	`"inherits": "debug",`
`@@ -82,7 +72,6 @@`
`82`	`72`	`"cacheVariables": {`
`83`	`73`	`"IPC_TOOLKIT_BUILD_PYTHON": "ON",`
`84`	`74`	`"IPC_TOOLKIT_BUILD_TESTS": "OFF",`
`85`		`- "IPC_TOOLKIT_WITH_SIMD": "OFF",`
`86`	`75`	`"IPC_TOOLKIT_WITH_CUDA": "OFF"`
`87`	`76`	`}`
`88`	`77`	`},`
`@@ -166,4 +155,4 @@`
`166`	`155`	`}`
`167`	`156`	`}`
`168`	`157`	`]`
`169`		`-}`
	`158`	`+}`