@@ -114,11 +114,19 @@ HIPRT_DEVICE HIPRT_INLINE Obb getNodeObb(
114114 if ( nodeType != BoxType )
115115 {
116116 if constexpr ( is_same<PrimitiveNode, TrianglePacketNode>::value )
117- return primNodes[nodeAddr].obb ( typeToTriPairIndex ( nodeType ), matrixIndex, nodeBox );
117+ {
118+ Obb obb = primNodes[nodeAddr].obb ( typeToTriPairIndex ( nodeType ), matrixIndex, nodeBox );
119+ if ( !obb.aabb ().valid () ) obb = Obb ( matrixIndex ).grow ( nodeBox );
120+ return obb;
121+ }
118122 else if constexpr ( is_same<PrimitiveNode, InstanceNode>::value )
123+ {
119124 return primitives.fetchObb ( primNodes[nodeAddr].m_primIndex , matrixIndex, nodeBox );
125+ }
120126 else
127+ {
121128 return Obb ( matrixIndex ).grow ( nodeBox );
129+ }
122130 }
123131 else
124132 {
@@ -1467,6 +1475,7 @@ __device__ void PackLeaves(
14671475 }
14681476}
14691477
1478+ // assuming that triangle packets are AMD specific, there are no warp syncs
14701479template <>
14711480__device__ void PackLeaves<TriangleMesh, TrianglePacketNode, GeomHeader>(
14721481 uint32_t index,
@@ -1754,6 +1763,7 @@ __device__ void PackLeaves<TriangleMesh, TrianglePacketNode, GeomHeader>(
17541763 }
17551764}
17561765
1766+ // assuming that triangle packets are AMD specific, there are no warp syncs
17571767__device__ void PackLeavesWarp (
17581768 uint32_t index,
17591769 uint32_t taskCount,
@@ -1813,15 +1823,13 @@ __device__ void PackLeavesWarp(
18131823 }
18141824
18151825 // find new vertices
1816- const uint32_t sublaneVertIndex = laneIndex % LanesPerLeafPacketTask;
1817- const uint32_t subwarpVertIndex = laneIndex / LanesPerLeafPacketTask;
1818- const bool valid = sublaneVertIndex < 3 || pairIndices.x != pairIndices.y ;
1826+ const bool valid = sublaneIndex < 3 || pairIndices.x != pairIndices.y ;
18191827
18201828 bool contains = false ;
18211829 uint32_t vertexIndexInPacket{};
18221830 for ( uint32_t k = 0 ; k < packet.m_vertCount ; ++k )
18231831 {
1824- if ( triPacketCache[subwarpIndex].m_vertexIndices [k] == ( &indices.x )[sublaneVertIndex ] )
1832+ if ( triPacketCache[subwarpIndex].m_vertexIndices [k] == ( &indices.x )[sublaneIndex ] )
18251833 {
18261834 vertexIndexInPacket = k;
18271835 contains = true ;
@@ -1830,7 +1838,7 @@ __device__ void PackLeavesWarp(
18301838 }
18311839
18321840 const uint32_t newVertMask =
1833- ( hiprt::ballot ( !contains && valid ) >> ( LanesPerLeafPacketTask * subwarpVertIndex ) ) & 0xf ;
1841+ ( hiprt::ballot ( !contains && valid ) >> ( LanesPerLeafPacketTask * subwarpIndex ) ) & 0xf ;
18341842 const uint32_t oldVertCount = packet.m_vertCount ;
18351843 const uint32_t newVertCount = __popc ( newVertMask );
18361844
@@ -1840,21 +1848,21 @@ __device__ void PackLeavesWarp(
18401848 // store new vertices
18411849 if ( !contains )
18421850 {
1843- const uint32_t vertexMask = ( 1 << sublaneVertIndex ) - 1 ;
1851+ const uint32_t vertexMask = ( 1 << sublaneIndex ) - 1 ;
18441852 vertexIndexInPacket = oldVertCount + __popc ( newVertMask & vertexMask );
1845- triPacketCache[subwarpIndex].m_vertexIndices [vertexIndexInPacket] = ( &indices.x )[sublaneVertIndex ];
1853+ triPacketCache[subwarpIndex].m_vertexIndices [vertexIndexInPacket] = ( &indices.x )[sublaneIndex ];
18461854 }
18471855
18481856 rangeOffset++;
1857+
1858+ // not sure why but this fence is needed on linux
1859+ __threadfence_block ();
18491860 }
1850- sync_warp ();
18511861 __threadfence_block ();
18521862
18531863 // count packets
18541864 if ( taskIndex < taskCount && packet.m_triPairCount > 0 ) primNodeCount++;
1855- sync_warp ();
18561865 }
1857- sync_warp ();
18581866
18591867 const uint32_t primNodeBase =
18601868 warpOffset ( sublaneIndex == LanesPerLeafPacketTask - 1 ? primNodeCount : 0u , &header->m_primNodeCount );
@@ -1865,11 +1873,9 @@ __device__ void PackLeavesWarp(
18651873 uint32_t leafIndex = 0 ;
18661874
18671875 rangeOffset = rangeBase;
1868- sync_warp ();
18691876
18701877 while ( hiprt::ballot ( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )
18711878 {
1872- sync_warp ();
18731879 TrianglePacketData packet{};
18741880
18751881 while ( rangeOffset < rangeBase + rangeSize && packet.m_triPairCount < MaxTrianglePairsPerTrianglePacket )
@@ -1900,23 +1906,22 @@ __device__ void PackLeavesWarp(
19001906 }
19011907
19021908 // find new vertices
1903- const uint32_t sublaneVertIndex = laneIndex % LanesPerLeafPacketTask;
1904- const uint32_t subwarpVertIndex = laneIndex / LanesPerLeafPacketTask;
1905- const bool valid = sublaneVertIndex < 3 || pairIndices.x != pairIndices.y ;
1909+ const bool valid = sublaneIndex < 3 || pairIndices.x != pairIndices.y ;
19061910
19071911 bool contains = false ;
19081912 uint32_t vertexIndexInPacket{};
19091913 for ( uint32_t k = 0 ; k < packet.m_vertCount ; ++k )
19101914 {
1911- if ( triPacketCache[subwarpIndex].m_vertexIndices [k] == ( &indices.x )[sublaneVertIndex ] )
1915+ if ( triPacketCache[subwarpIndex].m_vertexIndices [k] == ( &indices.x )[sublaneIndex ] )
19121916 {
19131917 vertexIndexInPacket = k;
19141918 contains = true ;
19151919 break ;
19161920 }
19171921 }
19181922
1919- const uint32_t newVertMask = ( hiprt::ballot ( !contains && valid ) >> ( 4 * subwarpVertIndex ) ) & 0xf ;
1923+ const uint32_t newVertMask =
1924+ ( hiprt::ballot ( !contains && valid ) >> ( LanesPerLeafPacketTask * subwarpIndex ) ) & 0xf ;
19201925 const uint32_t oldVertCount = packet.m_vertCount ;
19211926 const uint32_t newVertCount = __popc ( newVertMask );
19221927
@@ -1937,17 +1942,17 @@ __device__ void PackLeavesWarp(
19371942 // store new vertices
19381943 if ( !contains )
19391944 {
1940- const uint32_t vertexMask = ( 1 << sublaneVertIndex ) - 1 ;
1945+ const uint32_t vertexMask = ( 1 << sublaneIndex ) - 1 ;
19411946 vertexIndexInPacket = oldVertCount + __popc ( newVertMask & vertexMask );
1942- triPacketCache[subwarpIndex].m_vertexIndices [vertexIndexInPacket] = ( &indices.x )[sublaneVertIndex ];
1947+ triPacketCache[subwarpIndex].m_vertexIndices [vertexIndexInPacket] = ( &indices.x )[sublaneIndex ];
19431948 }
19441949
19451950 // shuffle vertex indices in packet
19461951 uint4 vertexIndicesInPacket;
1947- vertexIndicesInPacket.x = shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 0 );
1948- vertexIndicesInPacket.y = shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 1 );
1949- vertexIndicesInPacket.z = shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 2 );
1950- vertexIndicesInPacket.w = shfl ( vertexIndexInPacket, subwarpVertIndex * LanesPerLeafPacketTask + 3 );
1952+ vertexIndicesInPacket.x = shfl ( vertexIndexInPacket, subwarpIndex * LanesPerLeafPacketTask + 0 );
1953+ vertexIndicesInPacket.y = shfl ( vertexIndexInPacket, subwarpIndex * LanesPerLeafPacketTask + 1 );
1954+ vertexIndicesInPacket.z = shfl ( vertexIndexInPacket, subwarpIndex * LanesPerLeafPacketTask + 2 );
1955+ vertexIndicesInPacket.w = shfl ( vertexIndexInPacket, subwarpIndex * LanesPerLeafPacketTask + 3 );
19511956
19521957 uint3 triIndices0 = make_uint3 ( vertexIndicesInPacket );
19531958 uint3 triIndices1{};
@@ -1968,7 +1973,6 @@ __device__ void PackLeavesWarp(
19681973 // otherwise triIndices1 are not correcly written to the final packet
19691974 __threadfence_block ();
19701975 }
1971- sync_warp ();
19721976 __threadfence_block ();
19731977
19741978 // build packets
@@ -2000,7 +2004,6 @@ __device__ void PackLeavesWarp(
20002004 if ( halfLaneIndex < broadcastPacket.m_vertCount )
20012005 halfLaneVertexIndex = triPacketCache[broadcastSubwarpIndex].m_vertexIndices [halfLaneIndex];
20022006 }
2003- sync_warp ();
20042007
20052008 // reuse shared memory
20062009 TrianglePacketNode& triPacketNode =
@@ -2010,7 +2013,6 @@ __device__ void PackLeavesWarp(
20102013 triPacketNode.m_data [halfLaneIndex + 0 * 16 ] = 0 ;
20112014 triPacketNode.m_data [halfLaneIndex + 1 * 16 ] = 0 ;
20122015 }
2013- sync_warp ();
20142016
20152017 // build two packets at once
20162018 if ( halfWarpIndex == 0 || secondValid )
@@ -2041,7 +2043,6 @@ __device__ void PackLeavesWarp(
20412043 triPacketNode.writeVertex <true >( halfLaneIndex, vertex );
20422044 }
20432045 }
2044- sync_warp ();
20452046
20462047 // write packet
20472048 if ( ( halfWarpIndex == 0 || secondValid ) )
@@ -2051,13 +2052,10 @@ __device__ void PackLeavesWarp(
20512052 primNodes[broadcastPrimNodeOffset].m_data [halfLaneIndex + 1 * 16 ] =
20522053 triPacketNode.m_data [halfLaneIndex + 1 * 16 ];
20532054 }
2054- sync_warp ();
20552055 }
2056- sync_warp ();
20572056
20582057 if ( taskIndex < taskCount && packet.m_triPairCount > 0 ) primNodeOffset++;
20592058 }
2060- sync_warp ();
20612059
20622060 // patch children
20632061 if ( taskIndex < taskCount )
0 commit comments