Skip to content

Commit 99a2f8f

Browse files
committed
update to HIPRT 2.4
1 parent 83e18cc commit 99a2f8f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+1178
-1214
lines changed

CHANGELOG.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
2.4.c587aa7
2+
- H-PLOC and improved wide BVH conversion
3+
- CMake support
4+
- Configurable HIPRT path via a env. variable
5+
- New gfx architectures supported
6+
- hiprtBuildTraceKernel can return only the HIP module
7+
- HIP module caching and unloading (fixing a memory leak)
8+
- Fixing matrix inversion and identity check
9+
- Fixing refit and other minor issues
10+
11+
2.3.7df94af
12+
- Transformation query API changed/extended
13+
14+
2.2.0e68f54 (December 2023)
15+
- Multi-level instancing
16+
- Triangle pairing
17+
- AS Compaction
18+
- Optimized BVH build speed
19+
20+
2.1.c202dac (November 2023)
21+
- HIPRT binaries compiled with ROCm 5.7
22+
- A fix for caching trace kernels
23+
- A fix for the custom function table compilation
24+
- A fix for the fast and balanced builders with custom streams
25+
26+
2.1.6fc8ff0 (September 2023)
27+
- Dynamic traversal stack assignment
28+
- Batch BVH construction
29+
- Transformation query functions
30+
- Improved BVH construction speed
31+
- Improved RT speed for transformed instances
32+
- Fixed geometry IO API
33+
- Optional trace kernel caching
34+
35+
2.0.3a134c7 (May 2023)
36+
- BVH memory optimization
37+
- SBVH speed optimization
38+
- Fixing hiprtBuildTraceKernels
39+
- Dynamic loading via HIPRTEW
40+
- Traversal optimization
41+
42+
2.0.0 (February 2023)
43+
- Bitcode and precompilation (527.41 or newer driver is necessary to run on NVIDIA® on Windows®)
44+
- Performance improvement
45+
- Navi3x support
46+
- MI60 and MI200 support
47+
- Traversal hints for better performance
48+
- Concurrent build via streams
49+
- Custom function table
50+
- Intersection filter
51+
- Transformation matrices support
52+
- Multiple templated kernels
53+
- Added ray t min

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,12 @@ Then, you can use either premake or cmake.
3131

3232
   Example with Cmake on Windows:
3333
   5. `mkdir build`
34-
   6. `cmake -DCMAKE_BUILD_TYPE=Release -DBITCODE=OFF -S . -B build`
34+
   6. `cmake -DCMAKE_BUILD_TYPE=Release -DBITCODE=OFF -DHIP_PATH="C:\Program Files\AMD\ROCm\5.7" -S . -B build`
3535
   7. `Open build\hiprt.sln with Visual Studio 2022.`
3636

3737
   Example with Cmake on Linux:
3838
   5. `mkdir build`
39-
   6. `cmake -DCMAKE_BUILD_TYPE=Release -DBITCODE=OFF -S . -B build`
39+
   6. `cmake -DCMAKE_BUILD_TYPE=Release -DBITCODE=OFF -DHIP_PATH="/opt/rocm" -S . -B build`
4040
   7. `cmake --build build --config Release`
4141

4242

@@ -47,7 +47,7 @@ Add the option `--bitcode` in premake, or `-DBITCODE=ON` in cmake to enable prec
4747

4848
#### Generation of bitcode
4949
- After premake, go to `scripts/bitcodes`, then run `python compile.py` which compiles kernels to bitcode and fatbinary.
50-
- Or pass `--precompile` to premake. it executes the `compile.py` during premake. Note that you cannot do it in git bash on windows (because of hipcc...)
50+
- Or pass `--precompile` to premake, or `-DPRECOMPILE=ON` in cmake . It executes the `compile.py` during premake. Note that you cannot do it in git bash on windows (because of hipcc...)
5151

5252

5353
## Running Unit Tests

hiprt/hiprt_common.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@
3737
#include <cfloat>
3838
#include <cstring>
3939
#include <cmath>
40-
#include <cstdint>
4140
#include <map>
4241
#include <string>
4342
#include <vector>
@@ -48,6 +47,10 @@
4847
#define __device__
4948
#endif
5049

50+
#if !defined( __KERNELCC_RTC__ )
51+
#include <cstdint>
52+
#endif
53+
5154
#ifdef __CUDACC__
5255
// Switch to sync counterparts as CUDA recently deprecated the non-sync ones
5356
#define __shfl( x, y ) __shfl_sync( __activemask(), ( x ), ( y ) )
@@ -92,7 +95,6 @@
9295
#define HIPRT_DEVICE __device__
9396
#define HIPRT_HOST_DEVICE __host__ __device__
9497

95-
// TODO: cleanup after baking is removed
9698
#if defined( HIPRT_BAKE_KERNEL_GENERATED )
9799
#define GET_ARGS( X ) ( hip::X##Args )
98100
#define GET_INC( X ) ( hip::X##Includes )
@@ -173,6 +175,7 @@ constexpr uint32_t FullRayMask = ~0u;
173175
constexpr uint32_t MaxBatchBuildMaxPrimCount = 512u;
174176
constexpr uint32_t MaxInstanceLevels = 4u;
175177
constexpr uint32_t BranchingFactor = 4u;
178+
constexpr uint32_t DefaultAlignment = 64u;
176179

177180
#ifdef __KERNELCC__
178181
#if __gfx900__ || __gfx902__ || __gfx904__ || __gfx906__ || __gfx908__ || __gfx909__ || __gfx90a__ || __gfx90c__ || \
@@ -413,9 +416,9 @@ enum TraversalObjSize
413416
SizePrivateInstanceStack = 160,
414417
SizeGlobalInstanceStack = 48,
415418
SizeGeomTraversalCustomStack = 128,
416-
SizeSceneTraversalCustomStack = 192,
419+
SizeSceneTraversalCustomStack = 176,
417420
SizeGeomTraversalPrivateStack = 400,
418-
SizeSceneTraversalPrivateStack = 624,
421+
SizeSceneTraversalPrivateStack = 608,
419422
};
420423

421424
enum TraversalObjAlignment

hiprt/hiprt_libpath.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@
3232
#ifdef _WIN32
3333

3434
#ifdef HIPRT_PREFER_HIP_5
35-
const char* g_hip_paths[] = { "amdhip64.dll", "amdhip64_6.dll", NULL };
35+
const char* g_hip_paths[] = { "amdhip64.dll", "amdhip64_6.dll", NULL };
3636
#else
37-
const char* g_hip_paths[] = { "amdhip64_6.dll", "amdhip64.dll", NULL };
37+
const char* g_hip_paths[] = { "amdhip64_6.dll", "amdhip64.dll", NULL };
3838
#endif
3939

4040
const char* g_hiprtc_paths[] = {

hiprt/impl/BatchBuilder.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,17 +34,17 @@ DECLARE_TYPE_TRAITS( hiprtSceneBuildInput );
3434

3535
size_t BatchBuilder::getStorageBufferSize( const hiprtGeometryBuildInput& buildInput, const hiprtBuildOptions buildOptions )
3636
{
37-
const size_t primCount = getPrimCount( buildInput );
38-
const size_t nodeSize = getNodeSize( buildInput );
39-
const size_t nodeCount = divideRoundUp( 2 * primCount, 3 );
40-
return getGeometryStorageBufferSize( primCount, nodeCount, nodeSize );
37+
const size_t primCount = getPrimCount( buildInput );
38+
const size_t primNodeSize = getPrimNodeSize( buildInput );
39+
const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );
40+
return getGeometryStorageBufferSize( primCount, boxNodeCount, primNodeSize );
4141
}
4242

4343
size_t BatchBuilder::getStorageBufferSize( const hiprtSceneBuildInput& buildInput, const hiprtBuildOptions buildOptions )
4444
{
45-
const size_t frameCount = buildInput.frameCount;
46-
const size_t primCount = buildInput.instanceCount;
47-
const size_t nodeCount = divideRoundUp( 2 * primCount, 3 );
48-
return getSceneStorageBufferSize( primCount, nodeCount, frameCount );
45+
const size_t frameCount = buildInput.frameCount;
46+
const size_t primCount = buildInput.instanceCount;
47+
const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );
48+
return getSceneStorageBufferSize( primCount, primCount, boxNodeCount, frameCount );
4949
}
5050
} // namespace hiprt

hiprt/impl/BatchBuilderKernels.h

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -54,22 +54,22 @@ static constexpr size_t CacheSize = RoundUp( ( BatchBuilderMaxBlockSize - 1 ) *
5454
RoundUp( ( BatchBuilderMaxBlockSize ) * sizeof( ReferenceNode ), CacheAlignment ) +
5555
2 * RoundUp( BatchBuilderMaxBlockSize * sizeof( uint32_t ), CacheAlignment ) +
5656
RoundUp( BatchBuilderMaxBlockSize * sizeof( uint32_t ), CacheAlignment ) +
57-
RoundUp( BatchBuilderMaxBlockSize * sizeof( int2 ), CacheAlignment );
57+
RoundUp( BatchBuilderMaxBlockSize * sizeof( int3 ), CacheAlignment );
5858

5959
HIPRT_DEVICE size_t getStorageBufferSize( const hiprtGeometryBuildInput& buildInput )
6060
{
61-
const size_t primCount = getPrimCount( buildInput );
62-
const size_t nodeSize = getNodeSize( buildInput );
63-
const size_t nodeCount = divideRoundUp( 2 * primCount, 3 );
64-
return getGeometryStorageBufferSize( primCount, nodeCount, nodeSize );
61+
const size_t primCount = getPrimCount( buildInput );
62+
const size_t primNodeSize = getPrimNodeSize( buildInput );
63+
const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );
64+
return getGeometryStorageBufferSize( primCount, boxNodeCount, primNodeSize );
6565
}
6666

6767
HIPRT_DEVICE size_t getStorageBufferSize( const hiprtSceneBuildInput& buildInput )
6868
{
69-
const size_t frameCount = buildInput.frameCount;
70-
const size_t primCount = buildInput.instanceCount;
71-
const size_t nodeCount = divideRoundUp( 2 * primCount, 3 );
72-
return getSceneStorageBufferSize( primCount, nodeCount, frameCount );
69+
const size_t frameCount = buildInput.frameCount;
70+
const size_t primCount = buildInput.instanceCount;
71+
const size_t boxNodeCount = divideRoundUp( 2 * primCount, 3 );
72+
return getSceneStorageBufferSize( primCount, primCount, boxNodeCount, frameCount );
7373
}
7474

7575
template <typename PrimitiveNode, typename PrimitiveContainer>
@@ -88,23 +88,12 @@ build( PrimitiveContainer& primitives, uint32_t geomType, MemoryArena& storageMe
8888
// STEP 0: Init data
8989
if constexpr ( is_same<Header, SceneHeader>::value )
9090
{
91-
Instance* instances = storageMemoryArena.allocate<Instance>( primitives.getCount() );
92-
uint32_t* masks = storageMemoryArena.allocate<uint32_t>( primitives.getCount() );
93-
hiprtTransformHeader* transforms = storageMemoryArena.allocate<hiprtTransformHeader>( primitives.getCount() );
94-
Frame* frames = storageMemoryArena.allocate<Frame>( primitives.getFrameCount() );
91+
Frame* frames = storageMemoryArena.allocate<Frame>( primitives.getFrameCount() );
92+
Instance* instances = storageMemoryArena.allocate<Instance>( primitives.getCount() );
9593

9694
primitives.setFrames( frames );
9795
InitSceneData<>(
98-
index,
99-
storageMemoryArena.getStorageSize(),
100-
primitives,
101-
boxNodes,
102-
primNodes,
103-
instances,
104-
masks,
105-
transforms,
106-
frames,
107-
header );
96+
index, storageMemoryArena.getStorageSize(), primitives, boxNodes, primNodes, instances, frames, header );
10897
}
10998
else
11099
{
@@ -133,7 +122,7 @@ build( PrimitiveContainer& primitives, uint32_t geomType, MemoryArena& storageMe
133122
uint32_t* mortonCodeKeys = sharedMemoryArena.allocate<uint32_t>( blockDim.x );
134123
uint32_t* mortonCodeValues = sharedMemoryArena.allocate<uint32_t>( blockDim.x );
135124
uint32_t* updateCounters = sharedMemoryArena.allocate<uint32_t>( blockDim.x );
136-
int2* taskQueue = sharedMemoryArena.allocate<int2>( blockDim.x );
125+
int3* taskQueue = sharedMemoryArena.allocate<int3>( blockDim.x );
137126

138127
// STEP 1: Calculate centroid bounding box by reduction
139128
updateCounters[index] = InvalidValue;
@@ -173,27 +162,21 @@ build( PrimitiveContainer& primitives, uint32_t geomType, MemoryArena& storageMe
173162
}
174163

175164
// STEP 4: Emit topology and refit nodes
176-
EmitTopologyAndFitBounds(
177-
index, mortonCodeKeys, mortonCodeValues, updateCounters, primitives, scratchNodes, references, primNodes );
165+
EmitTopologyAndFitBounds( index, mortonCodeKeys, mortonCodeValues, updateCounters, primitives, scratchNodes, references );
178166
__syncthreads();
179167

180168
// STEP 5: Collapse
181169
uint32_t rootAddr = updateCounters[primCount - 1];
182-
if ( index == 0 ) taskQueue[0] = make_int2( rootAddr, InvalidValue );
170+
if ( index == 0 )
171+
taskQueue[index] = make_int3( encodeNodeIndex( rootAddr, BoxType ), 0, 0 );
172+
else
173+
taskQueue[index] = make_int3( InvalidValue, InvalidValue, InvalidValue );
183174
__syncthreads();
184175

185-
uint32_t taskCount = 1;
186-
uint32_t taskOffset = 0;
187-
while ( taskCount > 0 )
188-
{
189-
DeviceCollapse( index, taskCount, taskOffset, header, scratchNodes, references, boxNodes, primNodes, taskQueue );
190-
__syncthreads();
191-
192-
uint32_t nodeCount = header->m_boxNodeCount;
193-
taskOffset += taskCount;
194-
taskCount = nodeCount - taskOffset;
195-
__syncthreads();
196-
}
176+
uint32_t* taskCounter = &updateCounters[0];
177+
*taskCounter = 1;
178+
__syncthreads();
179+
Collapse( index, primCount, header, scratchNodes, references, boxNodes, primNodes, primitives, taskCounter, taskQueue );
197180
}
198181

199182
extern "C" __global__ void

0 commit comments

Comments
 (0)