99#include < tbb/parallel_for.h>
1010#include < tbb/parallel_sort.h>
1111
12- #ifdef __APPLE__
13- // We utilize SIMD registers to compare 1 Node against 4 Queries simultaneously.
14- #include < simd/simd.h>
12+ #ifdef IPC_TOOLKIT_WITH_SIMD
13+ // We utilize SIMD registers to compare one node against multiple queries
14+ // simultaneously, with the number of queries determined by
15+ // xs::batch<float>::size.
16+ #include < xsimd/xsimd.hpp>
17+ namespace xs = xsimd;
1518#endif
1619
20+ #include < array>
21+
1722using namespace std ::placeholders;
1823
1924namespace ipc {
@@ -448,9 +453,9 @@ namespace {
448453 } while (node_idx != LBVH::Node::INVALID_POINTER); // Same as root
449454 }
450455
451- #ifdef __APPLE__
456+ #ifdef IPC_TOOLKIT_WITH_SIMD
452457 // SIMD Traversal
453- // Traverses 4 queries simultaneously using SIMD.
458+ // Traverses multiple queries simultaneously using SIMD.
454459 template <typename Candidate, bool swap_order, bool triangular>
455460 void traverse_lbvh_simd (
456461 const LBVH::Node* queries,
@@ -459,28 +464,37 @@ namespace {
459464 const std::function<bool (size_t , size_t )>& can_collide,
460465 std::vector<Candidate>& candidates)
461466 {
462- assert (n_queries >= 1 && n_queries <= 4 );
463- // Load 4 queries into single registers (Structure of Arrays)
464- auto make_simd = [&](auto F) -> simd_float4 {
465- return simd_float4 {
466- F (0 ),
467- n_queries > 1 ? F (1 ) : 0 .0f ,
468- n_queries > 2 ? F (2 ) : 0 .0f ,
469- n_queries > 3 ? F (3 ) : 0 .0f ,
470- };
467+ using batch_t = xs::batch<float >;
468+ assert (n_queries >= 1 && n_queries <= batch_t ::size);
469+
470+ // Load queries into single registers
471+ auto make_simd = [&](auto F) -> batch_t {
472+ // 1. Create a buffer of the correct architecture-dependent size
473+ alignas (xs::default_arch::alignment ())
474+ std::array<float , batch_t ::size>
475+ buffer {};
476+
477+ #pragma unroll
478+ // 2. Fill the buffer, respecting the actual number of queries
479+ for (size_t i = 0 ; i < batch_t ::size; ++i) {
480+ buffer[i] = (i < n_queries) ? F (static_cast <int >(i)) : 0 .0f ;
481+ }
482+
483+ // 3. Load the buffer into the SIMD register
484+ return batch_t::load_aligned (buffer.data ());
471485 };
472486
473- const simd_float4 q_min_x =
487+ const auto q_min_x =
474488 make_simd ([&](int k) { return queries[k].aabb_min .x (); });
475- const simd_float4 q_min_y =
489+ const auto q_min_y =
476490 make_simd ([&](int k) { return queries[k].aabb_min .y (); });
477- const simd_float4 q_min_z =
491+ const auto q_min_z =
478492 make_simd ([&](int k) { return queries[k].aabb_min .z (); });
479- const simd_float4 q_max_x =
493+ const auto q_max_x =
480494 make_simd ([&](int k) { return queries[k].aabb_max .x (); });
481- const simd_float4 q_max_y =
495+ const auto q_max_y =
482496 make_simd ([&](int k) { return queries[k].aabb_max .y (); });
483- const simd_float4 q_max_z =
497+ const auto q_max_z =
484498 make_simd ([&](int k) { return queries[k].aabb_max .z (); });
485499
486500 // Use a fixed-size array as a stack to avoid dynamic allocations
@@ -505,31 +519,33 @@ namespace {
505519 const LBVH::Node& child_l = lbvh[node.left ];
506520 const LBVH::Node& child_r = lbvh[node.right ];
507521
508- // 1. Intersect 4 queries at once
522+ // 1. Intersect multiple queries at once
509523 // (child_l.min <= query.max) && (query.min <= child_l.max)
510- const simd_int4 intersects_l = (child_l.aabb_min .x () <= q_max_x)
524+ const xs::batch_bool<float > intersects_l =
525+ (child_l.aabb_min .x () <= q_max_x)
511526 & (child_l.aabb_min .y () <= q_max_y)
512527 & (child_l.aabb_min .z () <= q_max_z)
513528 & (q_min_x <= child_l.aabb_max .x ())
514529 & (q_min_y <= child_l.aabb_max .y ())
515530 & (q_min_z <= child_l.aabb_max .z ());
516531
517- // 2. Intersect 4 queries at once
532+ // 2. Intersect multiple queries at once
518533 // (child_r.min <= query.max) && (query.min <= child_r.max)
519- const simd_int4 intersects_r = (child_r.aabb_min .x () <= q_max_x)
534+ const xs::batch_bool<float > intersects_r =
535+ (child_r.aabb_min .x () <= q_max_x)
520536 & (child_r.aabb_min .y () <= q_max_y)
521537 & (child_r.aabb_min .z () <= q_max_z)
522538 & (q_min_x <= child_r.aabb_max .x ())
523539 & (q_min_y <= child_r.aabb_max .y ())
524540 & (q_min_z <= child_r.aabb_max .z ());
525541
526- const bool any_intersects_l = simd_any (intersects_l);
527- const bool any_intersects_r = simd_any (intersects_r);
542+ const bool any_intersects_l = xs::any (intersects_l);
543+ const bool any_intersects_r = xs::any (intersects_r);
528544
529545 // Query overlaps a leaf node => report collision
530546 if (any_intersects_l && child_l.is_leaf ()) {
531547 for (int k = 0 ; k < n_queries; ++k) {
532- if (intersects_l[k] ) {
548+ if (intersects_l. get (k) ) {
533549 attempt_add_candidate<
534550 Candidate, swap_order, triangular>(
535551 queries[k], child_l, can_collide, candidates);
@@ -538,7 +554,7 @@ namespace {
538554 }
539555 if (any_intersects_r && child_r.is_leaf ()) {
540556 for (int k = 0 ; k < n_queries; ++k) {
541- if (intersects_r[k] ) {
557+ if (intersects_r. get (k) ) {
542558 attempt_add_candidate<
543559 Candidate, swap_order, triangular>(
544560 queries[k], child_r, can_collide, candidates);
@@ -576,9 +592,12 @@ namespace {
576592 const std::function<bool (size_t , size_t )>& can_collide,
577593 tbb::enumerable_thread_specific<std::vector<Candidate>>& storage)
578594 {
579- #ifdef __APPLE__ // Only support SIMD on Apple platforms for now
580- constexpr size_t SIMD_SIZE = use_simd ? 4 : 1 ;
581- constexpr size_t GRAIN_SIZE = use_simd ? 16 : 1 ;
595+ #ifdef IPC_TOOLKIT_WITH_SIMD // Enable SIMD acceleration when available
596+ constexpr size_t SIMD_SIZE = use_simd ? xs::batch<float >::size : 1 ;
597+ static_assert (
598+ 64 % xs::batch<float >::size == 0 , " GRAIN_SIZE must be an integer" );
599+ constexpr size_t GRAIN_SIZE =
600+ use_simd ? (64 / xs::batch<float >::size) : 1 ;
582601#else
583602 constexpr size_t SIMD_SIZE = 1 ;
584603 constexpr size_t GRAIN_SIZE = 1 ;
@@ -595,11 +614,13 @@ namespace {
595614 tbb::blocked_range<size_t >(size_t (0 ), n_tasks, GRAIN_SIZE),
596615 [&](const tbb::blocked_range<size_t >& r) {
597616 auto & local_candidates = storage.local ();
617+ #ifdef IPC_TOOLKIT_WITH_SIMD
598618 const size_t actual_end = // Handle tail case
599619 std::min (SIMD_SIZE * r.end (), n_source_leaves);
620+ #endif
600621 for (size_t i = r.begin (); i < r.end (); ++i) {
601622 const size_t idx = SIMD_SIZE * i;
602- #ifdef __APPLE__
623+ #ifdef IPC_TOOLKIT_WITH_SIMD
603624 if constexpr (use_simd) {
604625 assert (actual_end - idx >= 1 );
605626 traverse_lbvh_simd<Candidate, swap_order, triangular>(
@@ -611,7 +632,7 @@ namespace {
611632 traverse_lbvh<Candidate, swap_order, triangular>(
612633 source[source_leaf_offset + idx], target,
613634 can_collide, local_candidates);
614- #ifdef __APPLE__
635+ #ifdef IPC_TOOLKIT_WITH_SIMD
615636 }
616637#endif
617638 }
0 commit comments