@@ -27,8 +27,25 @@ THE SOFTWARE.
27
27
28
28
#define PARALLEL_BUILD
29
29
30
// Macro for allocating 16-byte aligned stack memory.
//
// Expands to a TYPE* pointing at room for COUNT elements of TYPE, carved out
// of the calling function's stack frame with alloca(). The memory is released
// when the calling function returns, so the pointer must not escape it, and
// the macro should not be invoked inside a loop (each expansion grows the
// frame).
//
// alloca() gives no alignment guarantee of its own, and aligning the returned
// pointer can move it forward by up to 15 bytes. The request is therefore
// padded by 15 bytes so that the aligned array still fits entirely inside the
// allocation.
#define STACK_ALLOC(COUNT, TYPE) static_cast<TYPE *>(Align(16, (COUNT) * sizeof(TYPE), alloca(RoundUp(16, (COUNT) * sizeof(TYPE)) + 15)))
32
+
30
33
namespace RadeonRays
31
34
{
35
// Rounds `size` up to the nearest multiple of `alignment`.
//
// The computation is a pure bit-mask operation, so the result is only a true
// multiple of `alignment` when `alignment` is a power of two.
inline
std::size_t RoundUp(std::size_t alignment, std::size_t size)
{
    // Low bits that must be zero in an aligned value.
    const std::size_t low_bits = alignment - 1;
    const std::size_t bumped = size + low_bits;
    return bumped & ~low_bits;
}
40
+
41
// Returns the first address at or after `ptr` that is a multiple of
// `alignment`.
//
//   alignment  required alignment in bytes (std::align expects a power of two)
//   size       number of bytes the caller will use at the returned address
//   ptr        start of the caller's buffer
//
// This function is not told how large the buffer behind `ptr` is. It passes
// std::align the worst-case requirement (size + alignment - 1 bytes), so the
// call always succeeds; the caller is responsible for the buffer actually
// having that much room.
//
// `space` must be initialized: std::align reads it to decide whether the
// aligned object fits, so passing an indeterminate value is undefined
// behaviour and can make the call return nullptr arbitrarily.
inline
void *Align(std::size_t alignment, std::size_t size, void *ptr)
{
    // Aligning moves the pointer forward by at most alignment - 1 bytes.
    std::size_t space = size + alignment - 1;
    return std::align(alignment, size, ptr, space);
}
48
+
32
49
#ifdef __GNUC__
33
50
#define clz (x ) __builtin_clz(x)
34
51
#define ctz (x ) __builtin_ctz(x)
@@ -314,14 +331,13 @@ namespace RadeonRays
314
331
{
315
332
auto sah = std::numeric_limits<float >::max ();
316
333
317
- // TODO: should use the args passed at construction time (gboisse)
318
- auto constexpr kNumBins = 64u ;
319
- std::uint32_t bin_count[kNumBins ];
320
- _MM_ALIGN16 __m128 bin_min[kNumBins ];
321
- _MM_ALIGN16 __m128 bin_max[kNumBins ];
334
+ // Allocate stack memory
335
+ auto bin_count = STACK_ALLOC (m_num_bins, std::uint32_t );
336
+ auto bin_min = STACK_ALLOC (m_num_bins, __m128);
337
+ auto bin_max = STACK_ALLOC (m_num_bins, __m128);
322
338
323
339
auto constexpr inf = std::numeric_limits<float >::infinity ();
324
- for (auto i = 0u ; i < kNumBins ; ++i)
340
+ for (auto i = 0u ; i < m_num_bins ; ++i)
325
341
{
326
342
bin_count[i] = 0 ;
327
343
bin_min[i] = _mm_set_ps (inf, inf, inf, inf);
@@ -346,8 +362,8 @@ namespace RadeonRays
346
362
347
363
auto full4 = request.num_refs & ~0x3 ;
348
364
auto num_bins = _mm_set_ps (
349
- static_cast <float >(kNumBins ), static_cast <float >(kNumBins ),
350
- static_cast <float >(kNumBins ), static_cast <float >(kNumBins ));
365
+ static_cast <float >(m_num_bins ), static_cast <float >(m_num_bins ),
366
+ static_cast <float >(m_num_bins ), static_cast <float >(m_num_bins ));
351
367
352
368
for (auto i = request.start_index ;
353
369
i < request.start_index + full4;
@@ -369,10 +385,10 @@ namespace RadeonRays
369
385
_mm_sub_ps (c, centroid_min),
370
386
centroid_extent_inv), num_bins);
371
387
372
- auto bin_idx0 = std::min (static_cast <uint32_t >(mm_select (bin_idx, 0u )), kNumBins - 1 );
373
- auto bin_idx1 = std::min (static_cast <uint32_t >(mm_select (bin_idx, 1u )), kNumBins - 1 );
374
- auto bin_idx2 = std::min (static_cast <uint32_t >(mm_select (bin_idx, 2u )), kNumBins - 1 );
375
- auto bin_idx3 = std::min (static_cast <uint32_t >(mm_select (bin_idx, 3u )), kNumBins - 1 );
388
+ auto bin_idx0 = std::min (static_cast <uint32_t >(mm_select (bin_idx, 0u )), m_num_bins - 1 );
389
+ auto bin_idx1 = std::min (static_cast <uint32_t >(mm_select (bin_idx, 1u )), m_num_bins - 1 );
390
+ auto bin_idx2 = std::min (static_cast <uint32_t >(mm_select (bin_idx, 2u )), m_num_bins - 1 );
391
+ auto bin_idx3 = std::min (static_cast <uint32_t >(mm_select (bin_idx, 3u )), m_num_bins - 1 );
376
392
377
393
++bin_count[bin_idx0];
378
394
++bin_count[bin_idx1];
@@ -411,9 +427,9 @@ namespace RadeonRays
411
427
{
412
428
auto idx = refs[i];
413
429
auto bin_idx = std::min (static_cast <uint32_t >(
414
- kNumBins *
430
+ m_num_bins *
415
431
(aabb_centroid[idx][axis] - cm) *
416
- cei), kNumBins - 1 );
432
+ cei), m_num_bins - 1 );
417
433
++bin_count[bin_idx];
418
434
419
435
bin_min[bin_idx] = _mm_min_ps (
@@ -424,12 +440,12 @@ namespace RadeonRays
424
440
_mm_load_ps (&aabb_max[idx].x ));
425
441
}
426
442
427
- _MM_ALIGN16 __m128 right_min[ kNumBins - 1 ] ;
428
- _MM_ALIGN16 __m128 right_max[ kNumBins - 1 ] ;
443
+ auto right_min = STACK_ALLOC (m_num_bins - 1 , __m128) ;
444
+ auto right_max = STACK_ALLOC (m_num_bins - 1 , __m128) ;
429
445
auto tmp_min = _mm_set_ps (inf, inf, inf, inf);
430
446
auto tmp_max = _mm_set_ps (-inf, -inf, -inf, -inf);
431
447
432
- for (auto i = kNumBins - 1 ; i > 0 ; --i)
448
+ for (auto i = m_num_bins - 1 ; i > 0 ; --i)
433
449
{
434
450
tmp_min = _mm_min_ps (tmp_min, bin_min[i]);
435
451
tmp_max = _mm_max_ps (tmp_max, bin_max[i]);
@@ -444,7 +460,7 @@ namespace RadeonRays
444
460
auto rc = request.num_refs ;
445
461
446
462
auto split_idx = -1 ;
447
- for (auto i = 0u ; i < kNumBins - 1 ; ++i)
463
+ for (auto i = 0u ; i < m_num_bins - 1 ; ++i)
448
464
{
449
465
tmp_min = _mm_min_ps (tmp_min, bin_min[i]);
450
466
tmp_max = _mm_max_ps (tmp_max, bin_max[i]);
@@ -465,7 +481,7 @@ namespace RadeonRays
465
481
}
466
482
}
467
483
468
- return mm_select (centroid_min, 0u ) + (split_idx + 1 ) * (mm_select (centroid_extent, 0u ) / kNumBins );
484
+ return mm_select (centroid_min, 0u ) + (split_idx + 1 ) * (mm_select (centroid_extent, 0u ) / m_num_bins );
469
485
}
470
486
471
487
Bvh2::NodeType Bvh2::HandleRequest (
0 commit comments