Skip to content

Commit 1f32a18

Browse files
committed
Added stack allocation
1 parent ff39614 commit 1f32a18

File tree

2 files changed

+36
-20
lines changed

2 files changed

+36
-20
lines changed

RadeonRays/src/accelerator/bvh2.cpp

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,25 @@ THE SOFTWARE.
2727

2828
#define PARALLEL_BUILD
2929

30+
// Macro for allocating 16-byte aligned stack memory for COUNT objects of TYPE.
// The memory comes from alloca(), so it is released automatically when the
// calling function returns; never use this in a loop or for unbounded counts.
// alloca() is not guaranteed to return a 16-byte aligned address, so the
// request is padded by 15 bytes: Align() may advance the pointer by up to
// 15 bytes and the payload must still fit inside the allocation.
#define STACK_ALLOC(COUNT, TYPE) static_cast<TYPE *>(Align(16, (COUNT) * sizeof(TYPE), alloca(RoundUp(16, (COUNT) * sizeof(TYPE)) + 15)))
32+
3033
namespace RadeonRays
3134
{
35+
// Rounds `size` up to the nearest multiple of `alignment`.
//
// Works for any non-zero alignment, not only powers of two; for power-of-two
// alignments the result is identical to the usual mask-based formula.
// An alignment of zero is treated as "no alignment" and returns `size`.
// Rounding is done from the remainder so that an already-aligned `size`
// close to the maximum value is not pushed through an overflowing addition.
inline
std::size_t RoundUp(std::size_t alignment, std::size_t size)
{
    if (alignment == 0)
    {
        return size;
    }

    const std::size_t remainder = size % alignment;
    return remainder == 0 ? size : size + (alignment - remainder);
}
40+
41+
// Returns the first address at or after `ptr` that is a multiple of
// `alignment`.
//
// alignment: required alignment in bytes; must be a power of two, as
//            std::align requires.
// size:      number of bytes that will be stored at the returned address.
// ptr:       start of the buffer. The caller must make sure the buffer holds
//            at least `size + alignment - 1` bytes, because the returned
//            pointer can be up to `alignment - 1` bytes past `ptr`.
inline
void *Align(std::size_t alignment, std::size_t size, void *ptr)
{
    // std::align reads `space` as the number of bytes available at `ptr` and
    // returns nullptr when the aligned block does not fit, so it must be
    // initialised. The buffer length is not known here; use the worst-case
    // requirement so the call always succeeds.
    std::size_t space = size + (alignment - 1);
    return std::align(alignment, size, ptr, space);
}
48+
3249
#ifdef __GNUC__
3350
#define clz(x) __builtin_clz(x)
3451
#define ctz(x) __builtin_ctz(x)
@@ -314,14 +331,13 @@ namespace RadeonRays
314331
{
315332
auto sah = std::numeric_limits<float>::max();
316333

317-
// TODO: should use the args passed at construction time (gboisse)
318-
auto constexpr kNumBins = 64u;
319-
std::uint32_t bin_count[kNumBins];
320-
_MM_ALIGN16 __m128 bin_min[kNumBins];
321-
_MM_ALIGN16 __m128 bin_max[kNumBins];
334+
// Allocate stack memory
335+
auto bin_count = STACK_ALLOC(m_num_bins, std::uint32_t);
336+
auto bin_min = STACK_ALLOC(m_num_bins, __m128);
337+
auto bin_max = STACK_ALLOC(m_num_bins, __m128);
322338

323339
auto constexpr inf = std::numeric_limits<float>::infinity();
324-
for (auto i = 0u; i < kNumBins; ++i)
340+
for (auto i = 0u; i < m_num_bins; ++i)
325341
{
326342
bin_count[i] = 0;
327343
bin_min[i] = _mm_set_ps(inf, inf, inf, inf);
@@ -346,8 +362,8 @@ namespace RadeonRays
346362

347363
auto full4 = request.num_refs & ~0x3;
348364
auto num_bins = _mm_set_ps(
349-
static_cast<float>(kNumBins), static_cast<float>(kNumBins),
350-
static_cast<float>(kNumBins), static_cast<float>(kNumBins));
365+
static_cast<float>(m_num_bins), static_cast<float>(m_num_bins),
366+
static_cast<float>(m_num_bins), static_cast<float>(m_num_bins));
351367

352368
for (auto i = request.start_index;
353369
i < request.start_index + full4;
@@ -369,10 +385,10 @@ namespace RadeonRays
369385
_mm_sub_ps(c, centroid_min),
370386
centroid_extent_inv), num_bins);
371387

372-
auto bin_idx0 = std::min(static_cast<uint32_t>(mm_select(bin_idx, 0u)), kNumBins - 1);
373-
auto bin_idx1 = std::min(static_cast<uint32_t>(mm_select(bin_idx, 1u)), kNumBins - 1);
374-
auto bin_idx2 = std::min(static_cast<uint32_t>(mm_select(bin_idx, 2u)), kNumBins - 1);
375-
auto bin_idx3 = std::min(static_cast<uint32_t>(mm_select(bin_idx, 3u)), kNumBins - 1);
388+
auto bin_idx0 = std::min(static_cast<uint32_t>(mm_select(bin_idx, 0u)), m_num_bins - 1);
389+
auto bin_idx1 = std::min(static_cast<uint32_t>(mm_select(bin_idx, 1u)), m_num_bins - 1);
390+
auto bin_idx2 = std::min(static_cast<uint32_t>(mm_select(bin_idx, 2u)), m_num_bins - 1);
391+
auto bin_idx3 = std::min(static_cast<uint32_t>(mm_select(bin_idx, 3u)), m_num_bins - 1);
376392

377393
++bin_count[bin_idx0];
378394
++bin_count[bin_idx1];
@@ -411,9 +427,9 @@ namespace RadeonRays
411427
{
412428
auto idx = refs[i];
413429
auto bin_idx = std::min(static_cast<uint32_t>(
414-
kNumBins *
430+
m_num_bins *
415431
(aabb_centroid[idx][axis] - cm) *
416-
cei), kNumBins - 1);
432+
cei), m_num_bins - 1);
417433
++bin_count[bin_idx];
418434

419435
bin_min[bin_idx] = _mm_min_ps(
@@ -424,12 +440,12 @@ namespace RadeonRays
424440
_mm_load_ps(&aabb_max[idx].x));
425441
}
426442

427-
_MM_ALIGN16 __m128 right_min[kNumBins - 1];
428-
_MM_ALIGN16 __m128 right_max[kNumBins - 1];
443+
auto right_min = STACK_ALLOC(m_num_bins - 1, __m128);
444+
auto right_max = STACK_ALLOC(m_num_bins - 1, __m128);
429445
auto tmp_min = _mm_set_ps(inf, inf, inf, inf);
430446
auto tmp_max = _mm_set_ps(-inf, -inf, -inf, -inf);
431447

432-
for (auto i = kNumBins - 1; i > 0; --i)
448+
for (auto i = m_num_bins - 1; i > 0; --i)
433449
{
434450
tmp_min = _mm_min_ps(tmp_min, bin_min[i]);
435451
tmp_max = _mm_max_ps(tmp_max, bin_max[i]);
@@ -444,7 +460,7 @@ namespace RadeonRays
444460
auto rc = request.num_refs;
445461

446462
auto split_idx = -1;
447-
for (auto i = 0u; i < kNumBins - 1; ++i)
463+
for (auto i = 0u; i < m_num_bins - 1; ++i)
448464
{
449465
tmp_min = _mm_min_ps(tmp_min, bin_min[i]);
450466
tmp_max = _mm_max_ps(tmp_max, bin_max[i]);
@@ -465,7 +481,7 @@ namespace RadeonRays
465481
}
466482
}
467483

468-
return mm_select(centroid_min, 0u) + (split_idx + 1) * (mm_select(centroid_extent, 0u) / kNumBins);
484+
return mm_select(centroid_min, 0u) + (split_idx + 1) * (mm_select(centroid_extent, 0u) / m_num_bins);
469485
}
470486

471487
Bvh2::NodeType Bvh2::HandleRequest(

RadeonRays/src/accelerator/bvh2.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ namespace RadeonRays
8484
// Node traversal cost
8585
float m_traversal_cost;
8686
// Number of spatial bins to use for SAH
87-
int m_num_bins;
87+
uint32_t m_num_bins;
8888

8989
static void *Allocate(std::size_t size, std::size_t alignment)
9090
{

0 commit comments

Comments
 (0)