GPUOpen-LibrariesAndSDKs
diff --git a/‎CHANGELOG.md‎
Lines changed: 53 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 3 additions & 3 deletions b/‎README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎hiprt/hiprt_common.h‎
Lines changed: 7 additions & 4 deletions b/‎hiprt/hiprt_common.h‎
Lines changed: 7 additions & 4 deletions
diff --git a/‎hiprt/hiprt_libpath.h‎
Lines changed: 2 additions & 2 deletions b/‎hiprt/hiprt_libpath.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎hiprt/impl/BatchBuilder.cpp‎
Lines changed: 8 additions & 8 deletions b/‎hiprt/impl/BatchBuilder.cpp‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎hiprt/impl/BatchBuilderKernels.h‎
Lines changed: 22 additions & 39 deletions b/‎hiprt/impl/BatchBuilderKernels.h‎
Lines changed: 22 additions & 39 deletions
@@ -0,0 +1,53 @@
+2.4.c587aa7
+- H-PLOC and improved wide BVH conversion
+- CMake support
+- Configurable HIPRT path via an environment variable
+- New gfx architectures supported 
+- hiprtBuildTraceKernel can return only the HIP module
+- HIP module caching and unloading (fixing a memory leak)
+- Fixing matrix inversion and identity check
+- Fixing refit and other minor issues
+
+2.3.7df94af
+- Transformation query API changed/extended
+
+2.2.0e68f54 (December 2023)
+- Multi-level instancing
+- Triangle pairing
+- AS Compaction
+- Optimized BVH build speed
+
+2.1.c202dac (November 2023)
+- HIPRT binaries compiled with ROCm 5.7
+- A fix for caching trace kernels
+- A fix for the custom function table compilation
+- A fix for the fast and balanced builders with custom streams
+
+2.1.6fc8ff0 (September 2023)
+- Dynamic traversal stack assignment
+- Batch BVH construction
+- Transformation query functions
+- Improved BVH construction speed
+- Improved RT speed for transformed instances
+- Fixed geometry IO API
+- Optional trace kernel caching
+
+2.0.3a134c7 (May 2023)
+- BVH memory optimization
+- SBVH speed optimization
+- Fixing hiprtBuildTraceKernels
+- Dynamic loading via HIPRTEW
+- Traversal optimization
+
+2.0.0 (February 2023)
+- Bitcode and precompilation (527.41 or newer driver is necessary to run on NVIDIA® on Windows®)
+- Performance improvement
+- Navi3x support
+- MI60 and MI200 support
+- Traversal hints for better performance
+- Concurrent build via streams
+- Custom function table
+- Intersection filter
+- Transformation matrices support
+- Multiple templated kernels
+- Added ray t min
@@ -31,12 +31,12 @@ Then, you can use either premake or cmake.
 
 &nbsp;&nbsp;&nbsp;Example with Cmake on Windows:  
 &nbsp;&nbsp;&nbsp;5. `mkdir build`  
-&nbsp;&nbsp;&nbsp;6. `cmake -DCMAKE_BUILD_TYPE=Release -DBITCODE=OFF -S . -B build`  
+&nbsp;&nbsp;&nbsp;6. `cmake -DCMAKE_BUILD_TYPE=Release -DBITCODE=OFF -DHIP_PATH="C:\Program Files\AMD\ROCm\5.7" -S . -B build`  
 &nbsp;&nbsp;&nbsp;7. `Open build\hiprt.sln with Visual Studio 2022.`  
 
 &nbsp;&nbsp;&nbsp;Example with Cmake on Linux:  
 &nbsp;&nbsp;&nbsp;5. `mkdir build`  
-&nbsp;&nbsp;&nbsp;6. `cmake -DCMAKE_BUILD_TYPE=Release -DBITCODE=OFF -S . -B build`  
+&nbsp;&nbsp;&nbsp;6. `cmake -DCMAKE_BUILD_TYPE=Release -DBITCODE=OFF -DHIP_PATH="/opt/rocm" -S . -B build`  
 &nbsp;&nbsp;&nbsp;7. `cmake --build build --config Release`  
 
 
@@ -47,7 +47,7 @@ Add the option `--bitcode` in premake, or `-DBITCODE=ON` in cmake to enable prec
 
 #### Generation of bitcode
 - After premake, go to `scripts/bitcodes`, then run `python compile.py` which compiles kernels to bitcode and fatbinary.
-- Or pass `--precompile` to premake. it executes the `compile.py` during premake. Note that you cannot do it in git bash on windows (because of hipcc...)
+- Or pass `--precompile` to premake, or `-DPRECOMPILE=ON` in cmake. It executes `compile.py` during premake. Note that you cannot do it in Git Bash on Windows (because of hipcc...)
 
 
 ## Running Unit Tests
 
@@ -37,7 +37,6 @@
 #include <cfloat>
 #include <cstring>
 #include <cmath>
-#include <cstdint>
 #include <map>
 #include <string>
 #include <vector>
@@ -48,6 +47,10 @@
 #define __device__
 #endif
 
+#if !defined( __KERNELCC_RTC__ )
+#include <cstdint>
+#endif
+
 #ifdef __CUDACC__
 // Switch to sync counterparts as CUDA recently deprecated the non-sync ones
 #define __shfl( x, y ) __shfl_sync( __activemask(), ( x ), ( y ) )
@@ -92,7 +95,6 @@
 #define HIPRT_DEVICE __device__
 #define HIPRT_HOST_DEVICE __host__ __device__
 
-// TODO: cleanup after baking is removed
 #if defined( HIPRT_BAKE_KERNEL_GENERATED )
 #define GET_ARGS( X ) ( hip::X##Args )
 #define GET_INC( X ) ( hip::X##Includes )
@@ -173,6 +175,7 @@ constexpr uint32_t FullRayMask				 = ~0u;
 constexpr uint32_t MaxBatchBuildMaxPrimCount = 512u;
 constexpr uint32_t MaxInstanceLevels		 = 4u;
 constexpr uint32_t BranchingFactor			 = 4u;
+constexpr uint32_t DefaultAlignment			 = 64u;
 
 #ifdef __KERNELCC__
 #if __gfx900__ || __gfx902__ || __gfx904__ || __gfx906__ || __gfx908__ || __gfx909__ || __gfx90a__ || __gfx90c__ || \
@@ -413,9 +416,9 @@ enum TraversalObjSize
 	SizePrivateInstanceStack	   = 160,
 	SizeGlobalInstanceStack		   = 48,
 	SizeGeomTraversalCustomStack   = 128,
-	SizeSceneTraversalCustomStack  = 192,
+	SizeSceneTraversalCustomStack  = 176,
 	SizeGeomTraversalPrivateStack  = 400,
-	SizeSceneTraversalPrivateStack = 624,
+	SizeSceneTraversalPrivateStack = 608,
 };
 
 enum TraversalObjAlignment
 
@@ -32,9 +32,9 @@
 #ifdef _WIN32
 
 #ifdef HIPRT_PREFER_HIP_5
-const char* g_hip_paths[]	 = { "amdhip64.dll", "amdhip64_6.dll", NULL };
+const char* g_hip_paths[] = { "amdhip64.dll", "amdhip64_6.dll", NULL };
 #else
-const char* g_hip_paths[]	 = { "amdhip64_6.dll", "amdhip64.dll", NULL };
+const char* g_hip_paths[] = { "amdhip64_6.dll", "amdhip64.dll", NULL };
 #endif
 
 const char* g_hiprtc_paths[] = {
 
@@ -34,17 +34,17 @@ DECLARE_TYPE_TRAITS( hiprtSceneBuildInput );
 
 size_t BatchBuilder::getStorageBufferSize( const hiprtGeometryBuildInput& buildInput, const hiprtBuildOptions buildOptions )
 {
-	const size_t primCount = getPrimCount( buildInput );
-	const size_t nodeSize  = getNodeSize( buildInput );
-	const size_t nodeCount = divideRoundUp( 2 * primCount, 3 );
-	return getGeometryStorageBufferSize( primCount, nodeCount, nodeSize );
+	const size_t primCount	  = getPrimCount( buildInput );
+	const size_t primNodeSize = getPrimNodeSize( buildInput );
+	const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );
+	return getGeometryStorageBufferSize( primCount, boxNodeCount, primNodeSize );
 }
 
 size_t BatchBuilder::getStorageBufferSize( const hiprtSceneBuildInput& buildInput, const hiprtBuildOptions buildOptions )
 {
-	const size_t frameCount = buildInput.frameCount;
-	const size_t primCount	= buildInput.instanceCount;
-	const size_t nodeCount	= divideRoundUp( 2 * primCount, 3 );
-	return getSceneStorageBufferSize( primCount, nodeCount, frameCount );
+	const size_t frameCount	  = buildInput.frameCount;
+	const size_t primCount	  = buildInput.instanceCount;
+	const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );
+	return getSceneStorageBufferSize( primCount, primCount, boxNodeCount, frameCount );
 }
 } // namespace hiprt
@@ -54,22 +54,22 @@ static constexpr size_t CacheSize = RoundUp( ( BatchBuilderMaxBlockSize - 1 ) *
 									RoundUp( ( BatchBuilderMaxBlockSize ) * sizeof( ReferenceNode ), CacheAlignment ) +
 									2 * RoundUp( BatchBuilderMaxBlockSize * sizeof( uint32_t ), CacheAlignment ) +
 									RoundUp( BatchBuilderMaxBlockSize * sizeof( uint32_t ), CacheAlignment ) +
-									RoundUp( BatchBuilderMaxBlockSize * sizeof( int2 ), CacheAlignment );
+									RoundUp( BatchBuilderMaxBlockSize * sizeof( int3 ), CacheAlignment );
 
 HIPRT_DEVICE size_t getStorageBufferSize( const hiprtGeometryBuildInput& buildInput )
 {
-	const size_t primCount = getPrimCount( buildInput );
-	const size_t nodeSize  = getNodeSize( buildInput );
-	const size_t nodeCount = divideRoundUp( 2 * primCount, 3 );
-	return getGeometryStorageBufferSize( primCount, nodeCount, nodeSize );
+	const size_t primCount	  = getPrimCount( buildInput );
+	const size_t primNodeSize = getPrimNodeSize( buildInput );
+	const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );
+	return getGeometryStorageBufferSize( primCount, boxNodeCount, primNodeSize );
 }
 
 HIPRT_DEVICE size_t getStorageBufferSize( const hiprtSceneBuildInput& buildInput )
 {
-	const size_t frameCount = buildInput.frameCount;
-	const size_t primCount	= buildInput.instanceCount;
-	const size_t nodeCount	= divideRoundUp( 2 * primCount, 3 );
-	return getSceneStorageBufferSize( primCount, nodeCount, frameCount );
+	const size_t frameCount	  = buildInput.frameCount;
+	const size_t primCount	  = buildInput.instanceCount;
+	const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );
+	return getSceneStorageBufferSize( primCount, primCount, boxNodeCount, frameCount );
 }
 
 template <typename PrimitiveNode, typename PrimitiveContainer>
@@ -88,23 +88,12 @@ build( PrimitiveContainer& primitives, uint32_t geomType, MemoryArena& storageMe
 	// STEP 0: Init data
 	if constexpr ( is_same<Header, SceneHeader>::value )
 	{
-		Instance*			  instances	 = storageMemoryArena.allocate<Instance>( primitives.getCount() );
-		uint32_t*			  masks		 = storageMemoryArena.allocate<uint32_t>( primitives.getCount() );
-		hiprtTransformHeader* transforms = storageMemoryArena.allocate<hiprtTransformHeader>( primitives.getCount() );
-		Frame*				  frames	 = storageMemoryArena.allocate<Frame>( primitives.getFrameCount() );
+		Frame*	  frames	= storageMemoryArena.allocate<Frame>( primitives.getFrameCount() );
+		Instance* instances = storageMemoryArena.allocate<Instance>( primitives.getCount() );
 
 		primitives.setFrames( frames );
 		InitSceneData<>(
-			index,
-			storageMemoryArena.getStorageSize(),
-			primitives,
-			boxNodes,
-			primNodes,
-			instances,
-			masks,
-			transforms,
-			frames,
-			header );
+			index, storageMemoryArena.getStorageSize(), primitives, boxNodes, primNodes, instances, frames, header );
 	}
 	else
 	{
@@ -133,7 +122,7 @@ build( PrimitiveContainer& primitives, uint32_t geomType, MemoryArena& storageMe
 	uint32_t*	   mortonCodeKeys	= sharedMemoryArena.allocate<uint32_t>( blockDim.x );
 	uint32_t*	   mortonCodeValues = sharedMemoryArena.allocate<uint32_t>( blockDim.x );
 	uint32_t*	   updateCounters	= sharedMemoryArena.allocate<uint32_t>( blockDim.x );
-	int2*		   taskQueue		= sharedMemoryArena.allocate<int2>( blockDim.x );
+	int3*		   taskQueue		= sharedMemoryArena.allocate<int3>( blockDim.x );
 
 	// STEP 1: Calculate centroid bounding box by reduction
 	updateCounters[index] = InvalidValue;
@@ -173,27 +162,21 @@ build( PrimitiveContainer& primitives, uint32_t geomType, MemoryArena& storageMe
 	}
 
 	// STEP 4: Emit topology and refit nodes
-	EmitTopologyAndFitBounds(
-		index, mortonCodeKeys, mortonCodeValues, updateCounters, primitives, scratchNodes, references, primNodes );
+	EmitTopologyAndFitBounds( index, mortonCodeKeys, mortonCodeValues, updateCounters, primitives, scratchNodes, references );
 	__syncthreads();
 
 	// STEP 5: Collapse
 	uint32_t rootAddr = updateCounters[primCount - 1];
-	if ( index == 0 ) taskQueue[0] = make_int2( rootAddr, InvalidValue );
+	if ( index == 0 )
+		taskQueue[index] = make_int3( encodeNodeIndex( rootAddr, BoxType ), 0, 0 );
+	else
+		taskQueue[index] = make_int3( InvalidValue, InvalidValue, InvalidValue );
 	__syncthreads();
 
-	uint32_t taskCount	= 1;
-	uint32_t taskOffset = 0;
-	while ( taskCount > 0 )
-	{
-		DeviceCollapse( index, taskCount, taskOffset, header, scratchNodes, references, boxNodes, primNodes, taskQueue );
-		__syncthreads();
-
-		uint32_t nodeCount = header->m_boxNodeCount;
-		taskOffset += taskCount;
-		taskCount = nodeCount - taskOffset;
-		__syncthreads();
-	}
+	uint32_t* taskCounter = &updateCounters[0];
+	*taskCounter		  = 1;
+	__syncthreads();
+	Collapse( index, primCount, header, scratchNodes, references, boxNodes, primNodes, primitives, taskCounter, taskQueue );
 }
 
 extern "C" __global__ void
Original file line number	Diff line number	Diff line change
`@@ -34,17 +34,17 @@ DECLARE_TYPE_TRAITS( hiprtSceneBuildInput );`
`34`	`34`
`35`	`35`	`size_t BatchBuilder::getStorageBufferSize( const hiprtGeometryBuildInput& buildInput, const hiprtBuildOptions buildOptions )`
`36`	`36`	`{`
`37`		`- const size_t primCount = getPrimCount( buildInput );`
`38`		`- const size_t nodeSize = getNodeSize( buildInput );`
`39`		`- const size_t nodeCount = divideRoundUp( 2 * primCount, 3 );`
`40`		`- return getGeometryStorageBufferSize( primCount, nodeCount, nodeSize );`
	`37`	`+ const size_t primCount = getPrimCount( buildInput );`
	`38`	`+ const size_t primNodeSize = getPrimNodeSize( buildInput );`
	`39`	`+ const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );`
	`40`	`+ return getGeometryStorageBufferSize( primCount, boxNodeCount, primNodeSize );`
`41`	`41`	`}`
`42`	`42`
`43`	`43`	`size_t BatchBuilder::getStorageBufferSize( const hiprtSceneBuildInput& buildInput, const hiprtBuildOptions buildOptions )`
`44`	`44`	`{`
`45`		`- const size_t frameCount = buildInput.frameCount;`
`46`		`- const size_t primCount = buildInput.instanceCount;`
`47`		`- const size_t nodeCount = divideRoundUp( 2 * primCount, 3 );`
`48`		`- return getSceneStorageBufferSize( primCount, nodeCount, frameCount );`
	`45`	`+ const size_t frameCount = buildInput.frameCount;`
	`46`	`+ const size_t primCount = buildInput.instanceCount;`
	`47`	`+ const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );`
	`48`	`+ return getSceneStorageBufferSize( primCount, primCount, boxNodeCount, frameCount );`
`49`	`49`	`}`
`50`	`50`	`} // namespace hiprt`