@@ -137,7 +137,7 @@ HIPRT_DEVICE HIPRT_INLINE void openNodes(
137137 << static_cast <uint64_t >( ( BranchingFactor * subwarpIndex ) );
138138
139139 bool done = childCount == BranchingFactor;
140- while ( __ballot ( !done ) )
140+ while ( hiprt::ballot ( !done ) )
141141 {
142142 sync_warp ();
143143
@@ -153,13 +153,13 @@ HIPRT_DEVICE HIPRT_INLINE void openNodes(
153153 float maxArea = area;
154154#pragma unroll
155155 for ( uint32_t i = 1 ; i < BranchingFactor; i <<= 1 )
156- maxArea = hiprt::max ( maxArea, __shfl_xor ( maxArea, i ) );
156+ maxArea = hiprt::max ( maxArea, shfl_xor ( maxArea, i ) );
157157 if ( maxArea < 0 .0f ) done = true ;
158158
159159 const uint32_t maxLaneIndex =
160- __ffsll ( static_cast <unsigned long long >( __ballot ( maxArea == area ) ) & subwarpMask ) - 1 ;
160+ __ffsll ( static_cast <unsigned long long >( hiprt::ballot ( maxArea == area ) ) & subwarpMask ) - 1 ;
161161 const uint32_t maxIndex = maxLaneIndex % BranchingFactor;
162- const uint32_t maxChildIndex = __shfl ( childIndex, maxLaneIndex );
162+ const uint32_t maxChildIndex = shfl ( childIndex, maxLaneIndex );
163163
164164 if ( !done )
165165 {
@@ -366,37 +366,35 @@ extern "C" __global__ void PairTriangles( TriangleMesh mesh, uint2* pairIndices,
366366
367367 bool valid = index < mesh.getCount ();
368368 uint32_t pairedIndex = InvalidValue;
369- uint64_t activeMask = __ballot ( valid );
369+ uint64_t activeMask = hiprt::ballot ( valid );
370370
371371 uint3 triIndices;
372372 if ( valid ) triIndices = mesh.fetchTriangleIndices ( index );
373373
374374 while ( activeMask )
375375 {
376- activeMask = __shfl ( activeMask, 0 );
376+ activeMask = shfl ( activeMask, 0 );
377377
378378 const uint64_t broadcastLane = __ffsll ( static_cast <unsigned long long >( activeMask ) ) - 1 ;
379379 if ( laneIndex == broadcastLane ) valid = false ;
380380
381381 activeMask &= activeMask - 1 ;
382382
383- const uint32_t broadcastIndex = __shfl ( index, broadcastLane );
383+ const uint32_t broadcastIndex = shfl ( index, broadcastLane );
384384 const uint3 triIndicesBroadcast = {
385- __shfl ( triIndices.x , broadcastLane ),
386- __shfl ( triIndices.y , broadcastLane ),
387- __shfl ( triIndices.z , broadcastLane ) };
385+ shfl ( triIndices.x , broadcastLane ), shfl ( triIndices.y , broadcastLane ), shfl ( triIndices.z , broadcastLane ) };
388386
389387 bool pairable = false ;
390388 if ( index != broadcastIndex && valid )
391389 pairable = tryPairTriangles ( triIndicesBroadcast, triIndices ).x != InvalidValue;
392390
393- const uint32_t firstPairedLane = __ffsll ( static_cast <unsigned long long >( __ballot ( pairable ) ) ) - 1 ;
391+ const uint32_t firstPairedLane = __ffsll ( static_cast <unsigned long long >( hiprt::ballot ( pairable ) ) ) - 1 ;
394392 if ( firstPairedLane < WarpSize )
395393 {
396394 activeMask &= ~( 1u << firstPairedLane );
397395 if ( laneIndex == firstPairedLane ) valid = false ;
398396
399- const uint32_t secondIndex = __shfl ( index, firstPairedLane );
397+ const uint32_t secondIndex = shfl ( index, firstPairedLane );
400398 if ( laneIndex == broadcastLane ) pairedIndex = secondIndex;
401399 }
402400 else if ( laneIndex == broadcastLane )
@@ -714,10 +712,10 @@ __device__ void FitBounds( Header* header, PrimitiveContainer& primitives, BoxNo
714712 internal = laneIndex < childCount && node.getChildType ( laneIndex ) == BoxType;
715713 }
716714
717- uint32_t internalCount = __popcll ( __ballot ( internal ) & subwarpMask );
715+ uint32_t internalCount = __popcll ( hiprt::ballot ( internal ) & subwarpMask );
718716 if ( internalCount > 0 ) done = true ;
719717
720- while ( __any ( !done ) )
718+ while ( hiprt::any ( !done ) )
721719 {
722720 __threadfence ();
723721
@@ -755,13 +753,13 @@ __device__ void FitBounds( Header* header, PrimitiveContainer& primitives, BoxNo
755753 internal = laneIndex < childCount && node.getChildType ( laneIndex ) == BoxType;
756754 }
757755
758- internalCount = __popcll ( __ballot ( internal ) & subwarpMask );
756+ internalCount = __popcll ( hiprt::ballot ( internal ) & subwarpMask );
759757
760758 __threadfence ();
761759
762760 if ( !done && sublaneIndex == 0 && atomicAdd ( &boxNodes[index].m_updateCounter , 1 ) < internalCount - 1 ) done = true ;
763761
764- done = __shfl ( done, subwarpIndex * BranchingFactor );
762+ done = shfl ( done, subwarpIndex * BranchingFactor );
765763 }
766764}
767765
@@ -827,11 +825,11 @@ __device__ void FitOrientedBounds(
827825 uint32_t childCount = node.getChildCount ();
828826 bool internal = laneIndex < childCount && node.getChildType ( laneIndex ) == BoxType;
829827
830- uint32_t internalCount = __popcll ( __ballot ( internal ) );
828+ uint32_t internalCount = __popcll ( hiprt::ballot ( internal ) );
831829
832830 bool done = internalCount > 0 ;
833831
834- while ( __any ( !done ) )
832+ while ( hiprt::any ( !done ) )
835833 {
836834 __threadfence ();
837835
@@ -858,9 +856,9 @@ __device__ void FitOrientedBounds(
858856 }
859857 }
860858
861- const float minArea = warpMin ( minAreaLane );
862- const uint32_t minIndex = __ffsll ( static_cast <unsigned long long >( __ballot ( minAreaLane == minArea ) ) ) - 1 ;
863- const uint32_t matrixIndex = __shfl ( minIndexLane, minIndex );
859+ const float minArea = warpMin ( minAreaLane );
860+ const uint32_t minIndex = __ffsll ( static_cast <unsigned long long >( hiprt::ballot ( minAreaLane == minArea ) ) ) - 1 ;
861+ const uint32_t matrixIndex = shfl ( minIndexLane, minIndex );
864862
865863 Aabb childBox;
866864 uint32_t childIndex;
@@ -889,13 +887,13 @@ __device__ void FitOrientedBounds(
889887 childCount = node.getChildCount ();
890888 internal = laneIndex < childCount && node.getChildType ( laneIndex ) == BoxType;
891889
892- internalCount = __popcll ( __ballot ( internal ) );
890+ internalCount = __popcll ( hiprt::ballot ( internal ) );
893891
894892 __threadfence ();
895893
896894 if ( laneIndex == 0 && atomicAdd ( &updateCounters[index], 1 ) < internalCount - 1 ) done = true ;
897895
898- done = __shfl ( done, 0 );
896+ done = shfl ( done, 0 );
899897 }
900898}
901899
@@ -1083,7 +1081,7 @@ __device__ void Collapse(
10831081
10841082 bool done = taskIndex >= maxBoxNodeCount || taskIndex >= referenceCount;
10851083
1086- while ( __any ( !done ) )
1084+ while ( hiprt::any ( !done ) )
10871085 {
10881086 sync_warp ();
10891087 __threadfence ();
@@ -1106,7 +1104,7 @@ __device__ void Collapse(
11061104 const bool valid = nodeIndex != InvalidValue && nodeAddr != InvalidValue && parentAddr != InvalidValue;
11071105
11081106 // skip inactive warps
1109- if ( __all ( !valid ) ) continue ;
1107+ if ( hiprt::all ( !valid ) ) continue ;
11101108
11111109 Aabb childBox;
11121110 uint32_t childIndex = InvalidValue;
@@ -1115,8 +1113,8 @@ __device__ void Collapse(
11151113 if ( nodeAddr == 0 ) parentAddr = InvalidValue;
11161114
11171115 // fill inactive lanes with first valid node index
1118- const uint32_t firstValidLane = __ffsll ( __ballot ( valid ) ) - 1 ;
1119- nodeIndex = __shfl ( nodeIndex, valid ? laneIndex : firstValidLane );
1116+ const uint32_t firstValidLane = __ffsll ( static_cast < unsigned long long >( hiprt::ballot ( valid ) ) ) - 1 ;
1117+ nodeIndex = shfl ( nodeIndex, valid ? laneIndex : firstValidLane );
11201118
11211119 BinaryNode binaryNode = binaryNodes[getNodeAddr ( nodeIndex )];
11221120 if ( sublaneIndex < 2 )
@@ -1136,7 +1134,7 @@ __device__ void Collapse(
11361134
11371135 const bool internal = isInternalNode ( childIndex ) && !isFatLeafNode ( childIndex );
11381136 const uint32_t childAddr = warpOffset ( active && internal, &header->m_boxNodeCount );
1139- const uint32_t internalBase = __shfl ( childAddr, subwarpIndex * BranchingFactor );
1137+ const uint32_t internalBase = shfl ( childAddr, subwarpIndex * BranchingFactor );
11401138 if ( active && internal )
11411139 {
11421140 atomic_store ( &taskQueue[childAddr], { childIndex, childAddr, nodeAddr } );
@@ -1166,7 +1164,7 @@ __device__ void Collapse(
11661164 if constexpr ( !is_same<PrimitiveNode, TrianglePacketNode>::value )
11671165 {
11681166 const bool leaf = isLeafNode ( childIndex );
1169- const uint64_t activeSubmask = __ballot ( active && leaf ) & subwarpMask;
1167+ const uint64_t activeSubmask = hiprt::ballot ( active && leaf ) & subwarpMask;
11701168 const uint32_t rangeSize = __popcll ( activeSubmask );
11711169 const uint32_t rangeAddr = warpOffset ( active && leaf, &header->m_referenceCount );
11721170 if ( active && leaf ) referenceIndices[rangeAddr] = childIndex;
@@ -1213,9 +1211,9 @@ __device__ void Collapse(
12131211 }
12141212 }
12151213
1216- const uint64_t activeSubmask = __ballot ( active && fatLeaf ) & subwarpMask;
1214+ const uint64_t activeSubmask = hiprt::ballot ( active && fatLeaf ) & subwarpMask;
12171215 const uint32_t lastActiveLane = activeSubmask == 0 ? 0 : ( WarpSize - 1 ) - __clzll ( activeSubmask );
1218- const uint32_t lastRangeOffset = __shfl ( rangeOffset, lastActiveLane );
1216+ const uint32_t lastRangeOffset = shfl ( rangeOffset, lastActiveLane );
12191217 if ( valid && sublaneIndex == 0 && activeSubmask != 0 )
12201218 {
12211219 const uint32_t rangeSize = lastRangeOffset - rangeBase;
@@ -1772,7 +1770,7 @@ __device__ void PackLeavesWarp(
17721770 uint32_t primNodeCount = 0 ;
17731771
17741772 const uint32_t rangeBase = rangeOffset;
1775- while ( __ballot ( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )
1773+ while ( hiprt::ballot ( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )
17761774 {
17771775 TrianglePacketData packet{};
17781776
@@ -1818,7 +1816,7 @@ __device__ void PackLeavesWarp(
18181816 }
18191817
18201818 const uint32_t newVertMask =
1821- ( __ballot ( !contains && valid ) >> ( LanesPerLeafPacketTask * subwarpVertIndex ) ) & 0xf ;
1819+ ( hiprt::ballot ( !contains && valid ) >> ( LanesPerLeafPacketTask * subwarpVertIndex ) ) & 0xf ;
18221820 const uint32_t oldVertCount = packet.m_vertCount ;
18231821 const uint32_t newVertCount = __popc ( newVertMask );
18241822
@@ -1855,7 +1853,7 @@ __device__ void PackLeavesWarp(
18551853 rangeOffset = rangeBase;
18561854 sync_warp ();
18571855
1858- while ( __ballot ( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )
1856+ while ( hiprt::ballot ( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )
18591857 {
18601858 sync_warp ();
18611859 TrianglePacketData packet{};
@@ -1904,7 +1902,7 @@ __device__ void PackLeavesWarp(
19041902 }
19051903 }
19061904
1907- const uint32_t newVertMask = ( __ballot ( !contains && valid ) >> ( 4 * subwarpVertIndex ) ) & 0xf ;
1905+ const uint32_t newVertMask = ( hiprt::ballot ( !contains && valid ) >> ( 4 * subwarpVertIndex ) ) & 0xf ;
19081906 const uint32_t oldVertCount = packet.m_vertCount ;
19091907 const uint32_t newVertCount = __popc ( newVertMask );
19101908
@@ -1932,10 +1930,10 @@ __device__ void PackLeavesWarp(
19321930
19331931 // shuffle vertex indices in packet
19341932 uint4 vertexIndicesInPacket;
1935- vertexIndicesInPacket.x = __shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 0 );
1936- vertexIndicesInPacket.y = __shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 1 );
1937- vertexIndicesInPacket.z = __shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 2 );
1938- vertexIndicesInPacket.w = __shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 3 );
1933+ vertexIndicesInPacket.x = shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 0 );
1934+ vertexIndicesInPacket.y = shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 1 );
1935+ vertexIndicesInPacket.z = shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 2 );
1936+ vertexIndicesInPacket.w = shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 3 );
19391937
19401938 uint3 triIndices0 = make_uint3 ( vertexIndicesInPacket );
19411939 uint3 triIndices1{};
@@ -1957,7 +1955,7 @@ __device__ void PackLeavesWarp(
19571955 __threadfence_block ();
19581956
19591957 // build packets
1960- uint64_t packetMask = __ballot ( taskIndex < taskCount && packet.m_triPairCount > 0 && sublaneIndex == 0 );
1958+ uint64_t packetMask = hiprt::ballot ( taskIndex < taskCount && packet.m_triPairCount > 0 && sublaneIndex == 0 );
19611959 while ( packetMask )
19621960 {
19631961 const uint32_t halfWarpIndex = laneIndex / 16 ;
@@ -1972,7 +1970,7 @@ __device__ void PackLeavesWarp(
19721970
19731971 const uint32_t broadcastLane = ( halfWarpIndex == 0 ) ? broadcastLane0 : broadcastLane1;
19741972 const uint32_t broadcastSubwarpIndex = broadcastLane / LanesPerLeafPacketTask;
1975- const uint32_t broadcastPrimNodeOffset = __shfl ( primNodeOffset, broadcastLane );
1973+ const uint32_t broadcastPrimNodeOffset = shfl ( primNodeOffset, broadcastLane );
19761974 const TrianglePacketData broadcastPacket = packet.shuffle ( broadcastLane );
19771975
19781976 // store current packet data to registers
0 commit comments