Skip to content

Commit 592a92b

Browse files
committed
Update to 3.0.9df32e7
1 parent a1fc965 commit 592a92b

File tree

650 files changed

+174777
-3057
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

650 files changed

+174777
-3057
lines changed

.clang-format

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ BasedOnStyle: LLVM
22
IndentWidth: 4
33
TabWidth: 4
44
UseTab: Always
5-
BreakBeforeBraces: Custom
65
ColumnLimit: 128
76
AllowShortBlocksOnASingleLine: false
87
AllowShortIfStatementsOnASingleLine: true

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@ hiprt/cache/Kernels.h
1515
hiprt/cache/KernelArgs.h
1616
PUBLIC_OUT/
1717
hiprt/impl/bvh_build_array.h
18+
scripts/bitcodes/__pycache__/

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "contrib/zstd"]
2+
path = contrib/zstd
3+
url = https://github.com/facebook/zstd.git

CMakeLists.txt

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cmake_minimum_required(VERSION 3.10)
1+
cmake_minimum_required(VERSION 3.19) # CMake 3.19 or newer is required for the compression-level option used when creating the Zstd archive
22
project(hiprt)
33

44
#
@@ -15,6 +15,7 @@ option(HIPRTEW "Use hiprtew" OFF)
1515
option(NO_ENCRYPT "Don't encrypt kernel source and binaries" OFF)
1616
option(NO_UNITTEST "Don't build unit tests" OFF)
1717
option(HIPRT_PREFER_HIP_5 "Prefer HIP 5" OFF)
18+
option(COMPILED_COMPRESSION "enable compression of compiled kernels" ON) # This option is only used if BAKE_COMPILED_KERNEL is enabled -- it is advised to leave it 'ON', as that is the path tested by the HIPRT team.
1819

1920
option(FORCE_DISABLE_CUDA "By default Cuda support is automatically added if a Cuda install is detected. Turn this flag to ON to force Cuda to be disabled." OFF)
2021

@@ -388,6 +389,10 @@ set(KERNEL_HIPRT_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/hiprt${version_
388389
set(KERNEL_UNITTEST_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/hiprt${version_str_}_${HIP_VERSION_STR}_precompiled_bitcode_${KERNEL_OS_POSTFIX}.hipfb") # example: hiprt02005_6.2_precompiled_bitcode_win.hipfb
389390
set(KERNEL_OROCHI_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/oro_compiled_kernels.hipfb")
390391

392+
# Temporary files: compressed versions of the compiled kernels.
393+
set(KERNEL_HIPRT_COMP_COMPRESSED "${CMAKE_BINARY_DIR}/hiprt${version_str_}_${HIP_VERSION_STR}_amd.zstd" )
394+
set(KERNEL_OROCHI_COMP_COMPRESSED "${CMAKE_BINARY_DIR}/oro_compiled_kernels.zstd" )
395+
391396

392397
# precompile kernels:
393398
if(PRECOMPILE)
@@ -479,11 +484,16 @@ if ( BAKE_COMPILED_KERNEL )
479484

480485
set(PYTHON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/scripts/convert_binary_to_array.py")
481486

487+
set(ARCHIVE_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/scripts/create_archive.cmake")
488+
482489
# HIPRT binary
483490
set(KERNEL_HIPRT_H "${CMAKE_CURRENT_SOURCE_DIR}/hiprt/impl/bvh_build_array.h")
484491
add_custom_command(
485492
OUTPUT ${KERNEL_HIPRT_H}
486-
COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_FILE} ${KERNEL_HIPRT_COMP} ${KERNEL_HIPRT_H}
493+
# 1) Create the Zstd archive
494+
COMMAND ${CMAKE_COMMAND} -DINPUT_FILE=${KERNEL_HIPRT_COMP} -DOUTPUT_FILE=${KERNEL_HIPRT_COMP_COMPRESSED} -DDO_COMPRESS=${COMPILED_COMPRESSION} -P ${ARCHIVE_SCRIPT}
495+
# 2) Run the Python converter on that archive
496+
COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_FILE} ${KERNEL_HIPRT_COMP} ${KERNEL_HIPRT_COMP_COMPRESSED} ${KERNEL_HIPRT_H} ${COMPILED_COMPRESSION}
487497
DEPENDS ${KERNEL_HIPRT_COMP} # Ensure compile.py has already run.
488498
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
489499
COMMENT "Converting HIPRT compiled kernel to header"
@@ -494,7 +504,10 @@ if ( BAKE_COMPILED_KERNEL )
494504
set(KERNEL_OROCHI_H "${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/ParallelPrimitives/cache/oro_compiled_kernels.h")
495505
add_custom_command(
496506
OUTPUT ${KERNEL_OROCHI_H}
497-
COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_FILE} ${KERNEL_OROCHI_COMP} ${KERNEL_OROCHI_H}
507+
# 1) Create the Zstd archive
508+
COMMAND ${CMAKE_COMMAND} -DINPUT_FILE=${KERNEL_OROCHI_COMP} -DOUTPUT_FILE=${KERNEL_OROCHI_COMP_COMPRESSED} -DDO_COMPRESS=${COMPILED_COMPRESSION} -P ${ARCHIVE_SCRIPT}
509+
# 2) Run the Python converter on that archive
510+
COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_FILE} ${KERNEL_OROCHI_COMP} ${KERNEL_OROCHI_COMP_COMPRESSED} ${KERNEL_OROCHI_H} ${COMPILED_COMPRESSION}
498511
DEPENDS ${KERNEL_OROCHI_COMP} # Ensure compile.py has already run.
499512
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
500513
COMMENT "Converting Orochi compiled kernel to header"
@@ -533,8 +546,40 @@ endif()
533546

534547

535548
if ( BAKE_COMPILED_KERNEL )
549+
550+
551+
if ( COMPILED_COMPRESSION )
552+
# Gather minimal Zstd sources
553+
file(GLOB ZSTD_SRCS
554+
contrib/zstd/lib/common/*.c
555+
contrib/zstd/lib/decompress/*.c
556+
)
557+
558+
# Build a static lib zstd_embedded
559+
add_library(zstd_embedded STATIC
560+
${ZSTD_SRCS}
561+
)
562+
563+
# Include Zstd headers
564+
target_include_directories(zstd_embedded
565+
PUBLIC
566+
contrib/zstd/lib
567+
)
568+
569+
set_target_properties(zstd_embedded PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
570+
target_compile_definitions(zstd_embedded PRIVATE ZSTD_DISABLE_ASM) # disable ASM for easier build
571+
572+
# Link against zstd_embedded
573+
target_link_libraries(${HIPRT_NAME} zstd_embedded )
574+
575+
# the 'ORO_LINK_ZSTD' flag enables use of ZSTD API in the source code.
576+
target_compile_definitions(${HIPRT_NAME} PRIVATE ORO_LINK_ZSTD)
577+
endif()
578+
579+
580+
536581
# enable the 'BAKE_COMPILED_KERNEL' on Orochi: this mode is activated by adding those 2 defines.
537-
target_compile_definitions(${HIPRT_NAME} PRIVATE ORO_PP_LOAD_FROM_STRING ORO_PRECOMPILED)
582+
target_compile_definitions(${HIPRT_NAME} PRIVATE ORO_PP_LOAD_FROM_STRING HIPRT_BITCODE_LINKING ORO_PRECOMPILED)
538583

539584
#enable the 'BAKE_COMPILED_KERNEL' on HIPRT:
540585
target_compile_definitions(${HIPRT_NAME} PRIVATE HIPRT_BAKE_COMPILED_KERNEL )
@@ -592,12 +637,17 @@ if(PRECOMPILE AND NOT BAKE_COMPILED_KERNEL)
592637
DESTINATION bin)
593638
endif()
594639

640+
641+
642+
643+
644+
595645
# Project: Unit Test
596646
if(NOT NO_UNITTEST)
597647

598648
add_executable(unittest)
599649

600-
if(BITCODE)
650+
if(BITCODE OR BAKE_COMPILED_KERNEL)
601651
target_compile_definitions(unittest PRIVATE HIPRT_BITCODE_LINKING)
602652
endif()
603653
if(WIN32)

contrib/Orochi/Orochi/Orochi.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1219,6 +1219,21 @@ oroError OROAPI oroCtxGetCurrent(oroCtx* pctx)
12191219
if ( e != hipSuccess )
12201220
return hip2oro(e);
12211221
}
1222+
1223+
// externally initialized context
1224+
if( s_oroCtxs.count( ctxt->m_ptr ) == 0 && ctxt->m_ptr )
1225+
{
1226+
ioroCtx_t* c = new ioroCtx_t;
1227+
c->m_ptr = ctxt->m_ptr;
1228+
c->setApi( s_api );
1229+
s_oroCtxs[ctxt->m_ptr] = c;
1230+
}
1231+
1232+
if (s_oroCtxs.count(ctxt->m_ptr) == 0)
1233+
{
1234+
return oroErrorNotReady;
1235+
}
1236+
12221237
( *pctx ) = s_oroCtxs[ctxt->m_ptr];
12231238
delete ctxt;
12241239
return oroSuccess;
@@ -2627,11 +2642,11 @@ oroError_t OROAPI oroModuleLaunchCooperativeKernelMultiDevice(oroFunctionLaunchP
26272642
hipModuleLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags) );
26282643
return oroErrorUnknown;
26292644
}
2630-
oroError_t OROAPI oroModuleLaunchKernel(oroFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, oroStream_t stream, void ** kernelParams, void ** extra)
2645+
oroError_t OROAPI oroModuleLaunchKernel(oroFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, oroStream_t stream, const void* const* kernelParams, const void* const* extra)
26312646
{
26322647
__ORO_FUNC(
26332648
CU4ORO::hipModuleLaunchKernel_cu4oro(__ORO_FORCE_CAST(CU4ORO::hipFunction_t,f), __ORO_FORCE_CAST(unsigned int,gridDimX), __ORO_FORCE_CAST(unsigned int,gridDimY), __ORO_FORCE_CAST(unsigned int,gridDimZ), __ORO_FORCE_CAST(unsigned int,blockDimX), __ORO_FORCE_CAST(unsigned int,blockDimY), __ORO_FORCE_CAST(unsigned int,blockDimZ), __ORO_FORCE_CAST(unsigned int,sharedMemBytes), __ORO_FORCE_CAST(CU4ORO::hipStream_t,stream), __ORO_FORCE_CAST(void **,kernelParams), __ORO_FORCE_CAST(void **,extra)),
2634-
hipModuleLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, stream, kernelParams, extra) );
2649+
hipModuleLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, stream, __ORO_FORCE_CAST(void**, kernelParams), __ORO_FORCE_CAST(void**, extra)) );
26352650
return oroErrorUnknown;
26362651
}
26372652
oroError_t OROAPI oroModuleLoad(oroModule_t * module, const char * fname)

contrib/Orochi/Orochi/Orochi.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -991,7 +991,7 @@ oroError_t OROAPI oroModuleGetGlobal(oroDeviceptr_t * dptr, size_t * bytes, oroM
991991
oroError_t OROAPI oroModuleGetTexRef(textureReference ** texRef, oroModule_t hmod, const char * name);
992992
oroError_t OROAPI oroModuleLaunchCooperativeKernel(oroFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, oroStream_t stream, void ** kernelParams);
993993
oroError_t OROAPI oroModuleLaunchCooperativeKernelMultiDevice(oroFunctionLaunchParams * launchParamsList, unsigned int numDevices, unsigned int flags);
994-
oroError_t OROAPI oroModuleLaunchKernel(oroFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, oroStream_t stream, void ** kernelParams, void ** extra);
994+
oroError_t OROAPI oroModuleLaunchKernel(oroFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, oroStream_t stream, const void* const* kernelParams, const void* const* extra);
995995
oroError_t OROAPI oroModuleLoad(oroModule_t * module, const char * fname);
996996
oroError_t OROAPI oroModuleLoadData(oroModule_t * module, const void * image);
997997
oroError_t OROAPI oroModuleLoadDataEx(oroModule_t * module, const void * image, unsigned int numOptions, oroJitOption * options, void ** optionValues);

contrib/Orochi/Orochi/OrochiUtils.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
#include <sys/stat.h>
3737
#endif
3838

39+
#ifdef ORO_LINK_ZSTD
40+
#include <contrib/zstd/lib/zstd.h>
41+
#endif
42+
3943
inline std::wstring utf8_to_wstring( const std::string& str )
4044
{
4145
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> myconv;
@@ -790,3 +794,45 @@ void OrochiUtils::launch2D( oroFunction func, int nx, int ny, const void** args,
790794
OROASSERT( e == oroSuccess, 0 );
791795
}
792796

797+
void OrochiUtils::HandlePrecompiled(std::vector<unsigned char>& out, const CompressedBuffer& buffer)
798+
{
799+
#ifdef ORO_LINK_ZSTD
800+
out.assign(buffer.uncompressedSize,0);
801+
802+
size_t decompressedSize = ZSTD_decompress(
803+
out.data(), // final uncompressed buffer
804+
out.size(), // final size
805+
buffer.data, // compressed buffer
806+
buffer.size // compressed buffer - size
807+
);
808+
809+
if ( decompressedSize != buffer.uncompressedSize )
810+
throw std::runtime_error( "ERROR: ZSTD_decompress FAILED." );
811+
#else
812+
throw std::runtime_error( "ERROR: ZSTD is not part of this build." );
813+
#endif
814+
return;
815+
}
816+
817+
818+
void OrochiUtils::HandlePrecompiled(std::vector<unsigned char>& out, const RawBuffer& buffer)
819+
{
820+
out = std::vector<unsigned char>(buffer.data, buffer.data + buffer.size );
821+
return;
822+
}
823+
824+
825+
void OrochiUtils::HandlePrecompiled(std::vector<unsigned char>& out, const unsigned char* rawData, size_t rawData_sizeByte, std::optional<size_t> uncompressed_sizeByte)
826+
{
827+
if (uncompressed_sizeByte.has_value()) {
828+
// if the input buffer is compressed :
829+
CompressedBuffer buffer{ rawData, rawData_sizeByte, uncompressed_sizeByte.value() };
830+
HandlePrecompiled(out, buffer );
831+
} else {
832+
// if the input buffer is not compressed
833+
RawBuffer buffer{ rawData, rawData_sizeByte };
834+
HandlePrecompiled(out, buffer );
835+
}
836+
}
837+
838+

contrib/Orochi/Orochi/OrochiUtils.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include <filesystem>
2828
#include <unordered_map>
2929
#include <vector>
30+
#include <optional>
3031

3132
#if defined( GNUC )
3233
#include <signal.h>
@@ -83,6 +84,20 @@ class OrochiUtils
8384
static void getModule( oroDevice device, const char* code, const char* path, std::vector<const char*>* optsIn, const char* funcName, oroModule* moduleOut );
8485
static void launch1D( oroFunction func, int nx, const void** args, int wgSize = 64, unsigned int sharedMemBytes = 0, oroStream stream = 0 );
8586
static void launch2D( oroFunction func, int nx, int ny, const void** args, int wgSizeX = 8, int wgSizeY = 8, unsigned int sharedMemBytes = 0, oroStream stream = 0 );
87+
88+
89+
struct CompressedBuffer {
90+
const unsigned char* data = nullptr; // compressed data
91+
size_t size = 0; // size in bytes of 'data'
92+
size_t uncompressedSize = 0; // size in bytes of the uncompressed data.
93+
};
94+
struct RawBuffer {
95+
const unsigned char* data = nullptr;
96+
size_t size = 0;
97+
};
98+
static void HandlePrecompiled(std::vector<unsigned char>& out, const CompressedBuffer& buffer);
99+
static void HandlePrecompiled(std::vector<unsigned char>& out, const RawBuffer& buffer);
100+
static void HandlePrecompiled(std::vector<unsigned char>& out, const unsigned char* rawData, size_t rawData_sizeByte, std::optional<size_t> uncompressed_sizeByte=std::nullopt);
86101

87102
template<typename T>
88103
static void malloc( T*& ptr, size_t n )

0 commit comments

Comments
 (0)