Skip to content

Commit 13a4fd2

Browse files
committed
Update to 3.0.9ba63f3
1 parent 7bba606 commit 13a4fd2

File tree

12 files changed

+183
-150
lines changed

12 files changed

+183
-150
lines changed

hiprt/hiprt_common.h

Lines changed: 68 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,6 @@
5151
#include <cstdint>
5252
#endif
5353

54-
#ifdef __CUDACC__
55-
// Switch to sync counterparts as CUDA recently deprecated the non-sync ones
56-
#define __shfl( x, y ) __shfl_sync( __activemask(), ( x ), ( y ) )
57-
#define __shfl_up( x, y ) __shfl_up_sync( __activemask(), ( x ), ( y ) )
58-
#define __shfl_down( x, y ) __shfl_down_sync( __activemask(), ( x ), ( y ) )
59-
#define __shfl_xor( x, y ) __shfl_xor_sync( __activemask(), ( x ), ( y ) )
60-
#define __ballot( x ) __ballot_sync( __activemask(), ( x ) )
61-
#define __any( x ) __any_sync( __activemask(), ( x ) )
62-
#define __all( x ) __all_sync( __activemask(), ( x ) )
63-
#endif
64-
6554
#if !defined( __KERNELCC__ )
6655
#if defined( _MSC_VER )
6756
#define HIPRT_ASSERT( cond ) \
@@ -265,6 +254,74 @@ HIPRT_HOST_DEVICE HIPRT_INLINE uint32_t clz( uint32_t value )
265254
#endif
266255
}
267256

257+
#ifdef __KERNELCC__
258+
template <typename T>
259+
HIPRT_INLINE HIPRT_DEVICE T shfl( T var, int srcLane )
260+
{
261+
#ifdef __CUDACC__
262+
return __shfl_sync( __activemask(), var, srcLane );
263+
#else
264+
return __shfl( var, srcLane );
265+
#endif
266+
}
267+
268+
template <typename T>
269+
HIPRT_INLINE HIPRT_DEVICE T shfl_up( T var, int srcLane )
270+
{
271+
#ifdef __CUDACC__
272+
return __shfl_up_sync( __activemask(), var, srcLane );
273+
#else
274+
return __shfl_up( var, srcLane );
275+
#endif
276+
}
277+
278+
template <typename T>
279+
HIPRT_INLINE HIPRT_DEVICE T shfl_down( T var, int srcLane )
280+
{
281+
#ifdef __CUDACC__
282+
return __shfl_down_sync( __activemask(), var, srcLane );
283+
#else
284+
return __shfl_down( var, srcLane );
285+
#endif
286+
}
287+
288+
template <typename T>
289+
HIPRT_INLINE HIPRT_DEVICE T shfl_xor( T var, int srcLane )
290+
{
291+
#ifdef __CUDACC__
292+
return __shfl_xor_sync( __activemask(), var, srcLane );
293+
#else
294+
return __shfl_xor( var, srcLane );
295+
#endif
296+
}
297+
298+
HIPRT_INLINE HIPRT_DEVICE uint64_t ballot( int predicate )
299+
{
300+
#ifdef __CUDACC__
301+
return static_cast<uint64_t>( __ballot_sync( __activemask(), predicate ) );
302+
#else
303+
return __ballot( predicate );
304+
#endif
305+
}
306+
307+
HIPRT_INLINE HIPRT_DEVICE uint32_t any( int predicate )
308+
{
309+
#ifdef __CUDACC__
310+
return __any_sync( __activemask(), predicate );
311+
#else
312+
return __any( predicate );
313+
#endif
314+
}
315+
HIPRT_INLINE HIPRT_DEVICE uint32_t all( int predicate )
316+
{
317+
#ifdef __CUDACC__
318+
return __all_sync( __activemask(), predicate );
319+
#else
320+
return __all( predicate );
321+
#endif
322+
}
323+
#endif
324+
268325
template <typename T, typename U>
269326
constexpr HIPRT_HOST_DEVICE T RoundUp( T value, U factor )
270327
{

hiprt/impl/Aabb.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,12 @@ class Aabb
124124
HIPRT_DEVICE Aabb shuffle( uint32_t index )
125125
{
126126
Aabb aabb;
127-
aabb.m_min.x = __shfl( m_min.x, index );
128-
aabb.m_min.y = __shfl( m_min.x, index );
129-
aabb.m_min.z = __shfl( m_min.x, index );
130-
aabb.m_max.x = __shfl( m_max.x, index );
131-
aabb.m_max.y = __shfl( m_max.y, index );
132-
aabb.m_max.z = __shfl( m_max.z, index );
127+
aabb.m_min.x = shfl( m_min.x, index );
128+
aabb.m_min.y = shfl( m_min.x, index );
129+
aabb.m_min.z = shfl( m_min.x, index );
130+
aabb.m_max.x = shfl( m_max.x, index );
131+
aabb.m_max.y = shfl( m_max.y, index );
132+
aabb.m_max.z = shfl( m_max.z, index );
133133
return aabb;
134134
}
135135
#endif

hiprt/impl/BvhBuilderKernels.h

Lines changed: 39 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ HIPRT_DEVICE HIPRT_INLINE void openNodes(
137137
<< static_cast<uint64_t>( ( BranchingFactor * subwarpIndex ) );
138138

139139
bool done = childCount == BranchingFactor;
140-
while ( __ballot( !done ) )
140+
while ( hiprt::ballot( !done ) )
141141
{
142142
sync_warp();
143143

@@ -153,13 +153,13 @@ HIPRT_DEVICE HIPRT_INLINE void openNodes(
153153
float maxArea = area;
154154
#pragma unroll
155155
for ( uint32_t i = 1; i < BranchingFactor; i <<= 1 )
156-
maxArea = hiprt::max( maxArea, __shfl_xor( maxArea, i ) );
156+
maxArea = hiprt::max( maxArea, shfl_xor( maxArea, i ) );
157157
if ( maxArea < 0.0f ) done = true;
158158

159159
const uint32_t maxLaneIndex =
160-
__ffsll( static_cast<unsigned long long>( __ballot( maxArea == area ) ) & subwarpMask ) - 1;
160+
__ffsll( static_cast<unsigned long long>( hiprt::ballot( maxArea == area ) ) & subwarpMask ) - 1;
161161
const uint32_t maxIndex = maxLaneIndex % BranchingFactor;
162-
const uint32_t maxChildIndex = __shfl( childIndex, maxLaneIndex );
162+
const uint32_t maxChildIndex = shfl( childIndex, maxLaneIndex );
163163

164164
if ( !done )
165165
{
@@ -366,37 +366,35 @@ extern "C" __global__ void PairTriangles( TriangleMesh mesh, uint2* pairIndices,
366366

367367
bool valid = index < mesh.getCount();
368368
uint32_t pairedIndex = InvalidValue;
369-
uint64_t activeMask = __ballot( valid );
369+
uint64_t activeMask = hiprt::ballot( valid );
370370

371371
uint3 triIndices;
372372
if ( valid ) triIndices = mesh.fetchTriangleIndices( index );
373373

374374
while ( activeMask )
375375
{
376-
activeMask = __shfl( activeMask, 0 );
376+
activeMask = shfl( activeMask, 0 );
377377

378378
const uint64_t broadcastLane = __ffsll( static_cast<unsigned long long>( activeMask ) ) - 1;
379379
if ( laneIndex == broadcastLane ) valid = false;
380380

381381
activeMask &= activeMask - 1;
382382

383-
const uint32_t broadcastIndex = __shfl( index, broadcastLane );
383+
const uint32_t broadcastIndex = shfl( index, broadcastLane );
384384
const uint3 triIndicesBroadcast = {
385-
__shfl( triIndices.x, broadcastLane ),
386-
__shfl( triIndices.y, broadcastLane ),
387-
__shfl( triIndices.z, broadcastLane ) };
385+
shfl( triIndices.x, broadcastLane ), shfl( triIndices.y, broadcastLane ), shfl( triIndices.z, broadcastLane ) };
388386

389387
bool pairable = false;
390388
if ( index != broadcastIndex && valid )
391389
pairable = tryPairTriangles( triIndicesBroadcast, triIndices ).x != InvalidValue;
392390

393-
const uint32_t firstPairedLane = __ffsll( static_cast<unsigned long long>( __ballot( pairable ) ) ) - 1;
391+
const uint32_t firstPairedLane = __ffsll( static_cast<unsigned long long>( hiprt::ballot( pairable ) ) ) - 1;
394392
if ( firstPairedLane < WarpSize )
395393
{
396394
activeMask &= ~( 1u << firstPairedLane );
397395
if ( laneIndex == firstPairedLane ) valid = false;
398396

399-
const uint32_t secondIndex = __shfl( index, firstPairedLane );
397+
const uint32_t secondIndex = shfl( index, firstPairedLane );
400398
if ( laneIndex == broadcastLane ) pairedIndex = secondIndex;
401399
}
402400
else if ( laneIndex == broadcastLane )
@@ -714,10 +712,10 @@ __device__ void FitBounds( Header* header, PrimitiveContainer& primitives, BoxNo
714712
internal = laneIndex < childCount && node.getChildType( laneIndex ) == BoxType;
715713
}
716714

717-
uint32_t internalCount = __popcll( __ballot( internal ) & subwarpMask );
715+
uint32_t internalCount = __popcll( hiprt::ballot( internal ) & subwarpMask );
718716
if ( internalCount > 0 ) done = true;
719717

720-
while ( __any( !done ) )
718+
while ( hiprt::any( !done ) )
721719
{
722720
__threadfence();
723721

@@ -755,13 +753,13 @@ __device__ void FitBounds( Header* header, PrimitiveContainer& primitives, BoxNo
755753
internal = laneIndex < childCount && node.getChildType( laneIndex ) == BoxType;
756754
}
757755

758-
internalCount = __popcll( __ballot( internal ) & subwarpMask );
756+
internalCount = __popcll( hiprt::ballot( internal ) & subwarpMask );
759757

760758
__threadfence();
761759

762760
if ( !done && sublaneIndex == 0 && atomicAdd( &boxNodes[index].m_updateCounter, 1 ) < internalCount - 1 ) done = true;
763761

764-
done = __shfl( done, subwarpIndex * BranchingFactor );
762+
done = shfl( done, subwarpIndex * BranchingFactor );
765763
}
766764
}
767765

@@ -827,11 +825,11 @@ __device__ void FitOrientedBounds(
827825
uint32_t childCount = node.getChildCount();
828826
bool internal = laneIndex < childCount && node.getChildType( laneIndex ) == BoxType;
829827

830-
uint32_t internalCount = __popcll( __ballot( internal ) );
828+
uint32_t internalCount = __popcll( hiprt::ballot( internal ) );
831829

832830
bool done = internalCount > 0;
833831

834-
while ( __any( !done ) )
832+
while ( hiprt::any( !done ) )
835833
{
836834
__threadfence();
837835

@@ -858,9 +856,9 @@ __device__ void FitOrientedBounds(
858856
}
859857
}
860858

861-
const float minArea = warpMin( minAreaLane );
862-
const uint32_t minIndex = __ffsll( static_cast<unsigned long long>( __ballot( minAreaLane == minArea ) ) ) - 1;
863-
const uint32_t matrixIndex = __shfl( minIndexLane, minIndex );
859+
const float minArea = warpMin( minAreaLane );
860+
const uint32_t minIndex = __ffsll( static_cast<unsigned long long>( hiprt::ballot( minAreaLane == minArea ) ) ) - 1;
861+
const uint32_t matrixIndex = shfl( minIndexLane, minIndex );
864862

865863
Aabb childBox;
866864
uint32_t childIndex;
@@ -889,13 +887,13 @@ __device__ void FitOrientedBounds(
889887
childCount = node.getChildCount();
890888
internal = laneIndex < childCount && node.getChildType( laneIndex ) == BoxType;
891889

892-
internalCount = __popcll( __ballot( internal ) );
890+
internalCount = __popcll( hiprt::ballot( internal ) );
893891

894892
__threadfence();
895893

896894
if ( laneIndex == 0 && atomicAdd( &updateCounters[index], 1 ) < internalCount - 1 ) done = true;
897895

898-
done = __shfl( done, 0 );
896+
done = shfl( done, 0 );
899897
}
900898
}
901899

@@ -1083,7 +1081,7 @@ __device__ void Collapse(
10831081

10841082
bool done = taskIndex >= maxBoxNodeCount || taskIndex >= referenceCount;
10851083

1086-
while ( __any( !done ) )
1084+
while ( hiprt::any( !done ) )
10871085
{
10881086
sync_warp();
10891087
__threadfence();
@@ -1106,7 +1104,7 @@ __device__ void Collapse(
11061104
const bool valid = nodeIndex != InvalidValue && nodeAddr != InvalidValue && parentAddr != InvalidValue;
11071105

11081106
// skip inactive warps
1109-
if ( __all( !valid ) ) continue;
1107+
if ( hiprt::all( !valid ) ) continue;
11101108

11111109
Aabb childBox;
11121110
uint32_t childIndex = InvalidValue;
@@ -1115,8 +1113,8 @@ __device__ void Collapse(
11151113
if ( nodeAddr == 0 ) parentAddr = InvalidValue;
11161114

11171115
// fill inactive lanes with first valid node index
1118-
const uint32_t firstValidLane = __ffsll( __ballot( valid ) ) - 1;
1119-
nodeIndex = __shfl( nodeIndex, valid ? laneIndex : firstValidLane );
1116+
const uint32_t firstValidLane = __ffsll( static_cast<unsigned long long>( hiprt::ballot( valid ) ) ) - 1;
1117+
nodeIndex = shfl( nodeIndex, valid ? laneIndex : firstValidLane );
11201118

11211119
BinaryNode binaryNode = binaryNodes[getNodeAddr( nodeIndex )];
11221120
if ( sublaneIndex < 2 )
@@ -1136,7 +1134,7 @@ __device__ void Collapse(
11361134

11371135
const bool internal = isInternalNode( childIndex ) && !isFatLeafNode( childIndex );
11381136
const uint32_t childAddr = warpOffset( active && internal, &header->m_boxNodeCount );
1139-
const uint32_t internalBase = __shfl( childAddr, subwarpIndex * BranchingFactor );
1137+
const uint32_t internalBase = shfl( childAddr, subwarpIndex * BranchingFactor );
11401138
if ( active && internal )
11411139
{
11421140
atomic_store( &taskQueue[childAddr], { childIndex, childAddr, nodeAddr } );
@@ -1166,7 +1164,7 @@ __device__ void Collapse(
11661164
if constexpr ( !is_same<PrimitiveNode, TrianglePacketNode>::value )
11671165
{
11681166
const bool leaf = isLeafNode( childIndex );
1169-
const uint64_t activeSubmask = __ballot( active && leaf ) & subwarpMask;
1167+
const uint64_t activeSubmask = hiprt::ballot( active && leaf ) & subwarpMask;
11701168
const uint32_t rangeSize = __popcll( activeSubmask );
11711169
const uint32_t rangeAddr = warpOffset( active && leaf, &header->m_referenceCount );
11721170
if ( active && leaf ) referenceIndices[rangeAddr] = childIndex;
@@ -1213,9 +1211,9 @@ __device__ void Collapse(
12131211
}
12141212
}
12151213

1216-
const uint64_t activeSubmask = __ballot( active && fatLeaf ) & subwarpMask;
1214+
const uint64_t activeSubmask = hiprt::ballot( active && fatLeaf ) & subwarpMask;
12171215
const uint32_t lastActiveLane = activeSubmask == 0 ? 0 : ( WarpSize - 1 ) - __clzll( activeSubmask );
1218-
const uint32_t lastRangeOffset = __shfl( rangeOffset, lastActiveLane );
1216+
const uint32_t lastRangeOffset = shfl( rangeOffset, lastActiveLane );
12191217
if ( valid && sublaneIndex == 0 && activeSubmask != 0 )
12201218
{
12211219
const uint32_t rangeSize = lastRangeOffset - rangeBase;
@@ -1772,7 +1770,7 @@ __device__ void PackLeavesWarp(
17721770
uint32_t primNodeCount = 0;
17731771

17741772
const uint32_t rangeBase = rangeOffset;
1775-
while ( __ballot( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )
1773+
while ( hiprt::ballot( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )
17761774
{
17771775
TrianglePacketData packet{};
17781776

@@ -1818,7 +1816,7 @@ __device__ void PackLeavesWarp(
18181816
}
18191817

18201818
const uint32_t newVertMask =
1821-
( __ballot( !contains && valid ) >> ( LanesPerLeafPacketTask * subwarpVertIndex ) ) & 0xf;
1819+
( hiprt::ballot( !contains && valid ) >> ( LanesPerLeafPacketTask * subwarpVertIndex ) ) & 0xf;
18221820
const uint32_t oldVertCount = packet.m_vertCount;
18231821
const uint32_t newVertCount = __popc( newVertMask );
18241822

@@ -1855,7 +1853,7 @@ __device__ void PackLeavesWarp(
18551853
rangeOffset = rangeBase;
18561854
sync_warp();
18571855

1858-
while ( __ballot( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )
1856+
while ( hiprt::ballot( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )
18591857
{
18601858
sync_warp();
18611859
TrianglePacketData packet{};
@@ -1904,7 +1902,7 @@ __device__ void PackLeavesWarp(
19041902
}
19051903
}
19061904

1907-
const uint32_t newVertMask = ( __ballot( !contains && valid ) >> ( 4 * subwarpVertIndex ) ) & 0xf;
1905+
const uint32_t newVertMask = ( hiprt::ballot( !contains && valid ) >> ( 4 * subwarpVertIndex ) ) & 0xf;
19081906
const uint32_t oldVertCount = packet.m_vertCount;
19091907
const uint32_t newVertCount = __popc( newVertMask );
19101908

@@ -1932,10 +1930,10 @@ __device__ void PackLeavesWarp(
19321930

19331931
// shuffle vertex indices in packet
19341932
uint4 vertexIndicesInPacket;
1935-
vertexIndicesInPacket.x = __shfl( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 0 );
1936-
vertexIndicesInPacket.y = __shfl( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 1 );
1937-
vertexIndicesInPacket.z = __shfl( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 2 );
1938-
vertexIndicesInPacket.w = __shfl( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 3 );
1933+
vertexIndicesInPacket.x = shfl( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 0 );
1934+
vertexIndicesInPacket.y = shfl( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 1 );
1935+
vertexIndicesInPacket.z = shfl( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 2 );
1936+
vertexIndicesInPacket.w = shfl( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 3 );
19391937

19401938
uint3 triIndices0 = make_uint3( vertexIndicesInPacket );
19411939
uint3 triIndices1{};
@@ -1957,7 +1955,7 @@ __device__ void PackLeavesWarp(
19571955
__threadfence_block();
19581956

19591957
// build packets
1960-
uint64_t packetMask = __ballot( taskIndex < taskCount && packet.m_triPairCount > 0 && sublaneIndex == 0 );
1958+
uint64_t packetMask = hiprt::ballot( taskIndex < taskCount && packet.m_triPairCount > 0 && sublaneIndex == 0 );
19611959
while ( packetMask )
19621960
{
19631961
const uint32_t halfWarpIndex = laneIndex / 16;
@@ -1972,7 +1970,7 @@ __device__ void PackLeavesWarp(
19721970

19731971
const uint32_t broadcastLane = ( halfWarpIndex == 0 ) ? broadcastLane0 : broadcastLane1;
19741972
const uint32_t broadcastSubwarpIndex = broadcastLane / LanesPerLeafPacketTask;
1975-
const uint32_t broadcastPrimNodeOffset = __shfl( primNodeOffset, broadcastLane );
1973+
const uint32_t broadcastPrimNodeOffset = shfl( primNodeOffset, broadcastLane );
19761974
const TrianglePacketData broadcastPacket = packet.shuffle( broadcastLane );
19771975

19781976
// store current packet data to registers

0 commit comments

Comments
 (0)