From b9754b6c38b4d3df03134a567e9855eeb44b253f Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Mon, 3 Feb 2025 09:14:25 +0100 Subject: [PATCH 01/32] Docs: remove virtual_rocr.rst --- .../memory_management/virtual_memory.rst | 4 +++ docs/index.md | 1 - docs/reference/virtual_rocr.rst | 35 ------------------- docs/sphinx/_toc.yml.in | 1 - 4 files changed, 4 insertions(+), 37 deletions(-) delete mode 100644 docs/reference/virtual_rocr.rst diff --git a/docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst b/docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst index 91f951b296..b771b8c902 100644 --- a/docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst +++ b/docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst @@ -25,6 +25,10 @@ issue of reallocation when the extra buffer runs out. Virtual memory management solves these memory management problems. It helps to reduce memory usage and unnecessary ``memcpy`` calls. +HIP virtual memory management is built on top of HSA, which provides low-level +access to AMD GPU memory. For more details on the underlying HSA runtime, +see :doc:`ROCr documentation ` + .. _memory_allocation_virtual_memory: Memory allocation diff --git a/docs/index.md b/docs/index.md index 7678aaae79..247c58e2fd 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,7 +42,6 @@ The HIP documentation is organized into the following categories: :::{grid-item-card} Reference * [HIP runtime API](./reference/hip_runtime_api_reference) -* [HSA runtime API for ROCm](./reference/virtual_rocr) * [HIP math API](./reference/math_api) * [HIP environment variables](./reference/env_variables) * [CUDA to HIP API Function Comparison](./reference/api_syntax) diff --git a/docs/reference/virtual_rocr.rst b/docs/reference/virtual_rocr.rst deleted file mode 100644 index 7510e6f78d..0000000000 --- a/docs/reference/virtual_rocr.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. meta:: - :description: This chapter lists user-mode API interfaces and libraries - necessary for host applications to launch compute kernels to - available HSA ROCm kernel agents. - :keywords: AMD, ROCm, HIP, HSA, ROCR runtime, virtual memory management - -******************************************************************************* -HSA runtime API for ROCm -******************************************************************************* - -The following functions are located in the https://github.com/ROCm/ROCR-Runtime repository. - -.. doxygenfunction:: hsa_amd_vmem_address_reserve - -.. doxygenfunction:: hsa_amd_vmem_address_free - -.. doxygenfunction:: hsa_amd_vmem_handle_create - -.. doxygenfunction:: hsa_amd_vmem_handle_release - -.. doxygenfunction:: hsa_amd_vmem_map - -.. doxygenfunction:: hsa_amd_vmem_unmap - -.. doxygenfunction:: hsa_amd_vmem_set_access - -.. doxygenfunction:: hsa_amd_vmem_get_access - -.. doxygenfunction:: hsa_amd_vmem_export_shareable_handle - -.. doxygenfunction:: hsa_amd_vmem_import_shareable_handle - -.. doxygenfunction:: hsa_amd_vmem_retain_alloc_handle - -.. 
doxygenfunction:: hsa_amd_vmem_get_alloc_properties_from_handle diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index ed0d7f914d..34050b2448 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -108,7 +108,6 @@ subtrees: - file: reference/hip_runtime_api/global_defines_enums_structs_files/driver_types - file: doxygen/html/annotated - file: doxygen/html/files - - file: reference/virtual_rocr - file: reference/math_api - file: reference/env_variables - file: reference/api_syntax From 6ba4f06eb758bd64c660784cb6db173773057285 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Wed, 12 Feb 2025 11:54:13 +0100 Subject: [PATCH 02/32] Fix documentation warnings --- docs/how-to/hip_runtime_api/asynchronous.rst | 2 +- .../memory_management/device_memory.rst | 48 +++++++++---------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/how-to/hip_runtime_api/asynchronous.rst b/docs/how-to/hip_runtime_api/asynchronous.rst index 81769da48e..82c024969f 100644 --- a/docs/how-to/hip_runtime_api/asynchronous.rst +++ b/docs/how-to/hip_runtime_api/asynchronous.rst @@ -136,7 +136,7 @@ This overlap of computation and data transfer ensures that the GPU is not idle while waiting for data. :cpp:func:`hipMemcpyPeerAsync` enables data transfers between different GPUs, facilitating multi-GPU communication. -:ref:`async_example`` include launching kernels in one stream while performing +:ref:`async_example` include launching kernels in one stream while performing data transfers in another. This technique is especially useful in applications with large data sets that need to be processed quickly. diff --git a/docs/how-to/hip_runtime_api/memory_management/device_memory.rst b/docs/how-to/hip_runtime_api/memory_management/device_memory.rst index 13fba386bb..54651a3f9f 100644 --- a/docs/how-to/hip_runtime_api/memory_management/device_memory.rst +++ b/docs/how-to/hip_runtime_api/memory_management/device_memory.rst @@ -69,34 +69,34 @@ better option, but is also limited in size. .. code-block:: cpp __global__ void kernel_memory_allocation(TYPE* pointer){ - // The pointer is stored in shared memory, so that all - // threads of the block can access the pointer - __shared__ int *memory; - - size_t blockSize = blockDim.x; - constexpr size_t elementsPerThread = 1024; - if(threadIdx.x == 0){ - // allocate memory in one contiguous block - memory = new int[blockDim.x * elementsPerThread]; - } - __syncthreads(); + // The pointer is stored in shared memory, so that all + // threads of the block can access the pointer + __shared__ int *memory; + + size_t blockSize = blockDim.x; + constexpr size_t elementsPerThread = 1024; + if(threadIdx.x == 0){ + // allocate memory in one contiguous block + memory = new int[blockDim.x * elementsPerThread]; + } + __syncthreads(); - // load pointer into thread-local variable to avoid - // unnecessary accesses to shared memory - int *localPtr = memory; + // load pointer into thread-local variable to avoid + // unnecessary accesses to shared memory + int *localPtr = memory; - // work with allocated memory, e.g. initialization - for(int i = 0; i < elementsPerThread; ++i){ - // access in a contiguous way - localPtr[i * blockSize + threadIdx.x] = i; - } + // work with allocated memory, e.g. 
initialization + for(int i = 0; i < elementsPerThread; ++i){ + // access in a contiguous way + localPtr[i * blockSize + threadIdx.x] = i; + } - // synchronize to make sure no thread is accessing the memory before freeing - __syncthreads(); - if(threadIdx.x == 0){ - delete[] memory; + // synchronize to make sure no thread is accessing the memory before freeing + __syncthreads(); + if(threadIdx.x == 0){ + delete[] memory; + } } -} Copying between device and host -------------------------------------------------------------------------------- From 370ad8983dfedd86bde315342b6a3f1261343dbc Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Wed, 12 Feb 2025 15:00:51 +0100 Subject: [PATCH 03/32] Reformat HIP RTC --- docs/how-to/hip_rtc.md | 535 ----------------------------- docs/how-to/hip_rtc.rst | 726 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 726 insertions(+), 535 deletions(-) delete mode 100644 docs/how-to/hip_rtc.md create mode 100644 docs/how-to/hip_rtc.rst diff --git a/docs/how-to/hip_rtc.md b/docs/how-to/hip_rtc.md deleted file mode 100644 index 14584828be..0000000000 --- a/docs/how-to/hip_rtc.md +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - -# Programming for HIP runtime compiler (RTC) - -HIP lets you compile kernels at runtime with the `hiprtc*` APIs. -Kernels can be stored as a text string and can be passed to HIPRTC APIs alongside options to guide the compilation. - -:::{note} - -* This library can be used on systems without HIP installed nor AMD GPU driver installed at all (offline compilation). Therefore, it doesn't depend on any HIP runtime library. -* This library depends on Code Object Manager (comgr). You can try to statically link comgr into HIPRTC to avoid ambiguity. -* Developers can bundle this library with their application. - -::: - -## Compilation APIs - -To use HIPRTC functionality, HIPRTC header needs to be included first. -`#include ` - -Kernels can be stored in a string: - -```cpp -static constexpr auto kernel_source { -R"( - extern "C" - __global__ void vector_add(float* output, float* input1, float* input2, size_t size) { - int i = threadIdx.x; - if (i < size) { - output[i] = input1[i] + input2[i]; - } - } -)"}; -``` - -Now to compile this kernel, it needs to be associated with `hiprtcProgram` type, which is done by declaring `hiprtcProgram prog;` and associating the string of kernel with this program: - -```cpp -hiprtcCreateProgram(&prog, // HIPRTC program handle - kernel_source, // HIP kernel source string - "vector_add.cpp", // Name of the HIP program, can be null or an empty string - 0, // Number of headers - NULL, // Header sources - NULL); // Name of header files -``` - -`hiprtcCreateProgram` API also allows you to add headers which can be included in your RTC program. -For online compilation, the compiler pre-defines HIP device API functions, HIP specific types and macros for device compilation, but does not include standard C/C++ headers by default. Users can only include header files provided to `hiprtcCreateProgram`. - -After associating the kernel string with `hiprtcProgram`, you can now compile this program using: - -```cpp -hiprtcCompileProgram(prog, // hiprtcProgram - 0, // Number of options - options); // Clang Options [Supported Clang Options](clang_options.md) -``` - -`hiprtcCompileProgram` returns a status value which can be converted to string via `hiprtcGetErrorString`. If compilation is successful, `hiprtcCompileProgram` will return `HIPRTC_SUCCESS`. 
- -If the compilation fails, you can look up the logs via: - -```cpp -size_t logSize; -hiprtcGetProgramLogSize(prog, &logSize); - -if (logSize) { - string log(logSize, '\0'); - hiprtcGetProgramLog(prog, &log[0]); - // Corrective action with logs -} -``` - -If the compilation is successful, you can load the compiled binary in a local variable. - -```cpp -size_t codeSize; -hiprtcGetCodeSize(prog, &codeSize); - -vector kernel_binary(codeSize); -hiprtcGetCode(prog, kernel_binary.data()); -``` - -After loading the binary, `hiprtcProgram` can be destroyed. -`hiprtcDestroyProgram(&prog);` - -The binary present in `kernel_binary` can now be loaded via `hipModuleLoadData` API. - -```cpp -hipModule_t module; -hipFunction_t kernel; - -hipModuleLoadData(&module, kernel_binary.data()); -hipModuleGetFunction(&kernel, module, "vector_add"); -``` - -And now this kernel can be launched via `hipModule` APIs. - -The full example is below: - -```cpp -#include -#include - -#include -#include -#include - -#define CHECK_RET_CODE(call, ret_code) \ - { \ - if ((call) != ret_code) { \ - std::cout << "Failed in call: " << #call << std::endl; \ - std::abort(); \ - } \ - } -#define HIP_CHECK(call) CHECK_RET_CODE(call, hipSuccess) -#define HIPRTC_CHECK(call) CHECK_RET_CODE(call, HIPRTC_SUCCESS) - -// source code for hiprtc -static constexpr auto kernel_source{ - R"( - extern "C" - __global__ void vector_add(float* output, float* input1, float* input2, size_t size) { - int i = threadIdx.x; - if (i < size) { - output[i] = input1[i] + input2[i]; - } - } -)"}; - -int main() { - hiprtcProgram prog; - auto rtc_ret_code = hiprtcCreateProgram(&prog, // HIPRTC program handle - kernel_source, // kernel source string - "vector_add.cpp", // Name of the file - 0, // Number of headers - NULL, // Header sources - NULL); // Name of header file - - if (rtc_ret_code != HIPRTC_SUCCESS) { - std::cout << "Failed to create program" << std::endl; - std::abort(); - } - - hipDeviceProp_t props; - int device = 0; - HIP_CHECK(hipGetDeviceProperties(&props, device)); - std::string sarg = std::string("--gpu-architecture=") + - props.gcnArchName; // device for which binary is to be generated - - const char* options[] = {sarg.c_str()}; - - rtc_ret_code = hiprtcCompileProgram(prog, // hiprtcProgram - 0, // Number of options - options); // Clang Options - if (rtc_ret_code != HIPRTC_SUCCESS) { - std::cout << "Failed to create program" << std::endl; - std::abort(); - } - - size_t logSize; - HIPRTC_CHECK(hiprtcGetProgramLogSize(prog, &logSize)); - - if (logSize) { - std::string log(logSize, '\0'); - HIPRTC_CHECK(hiprtcGetProgramLog(prog, &log[0])); - std::cout << "Compilation failed with: " << log << std::endl; - std::abort(); - } - - size_t codeSize; - HIPRTC_CHECK(hiprtcGetCodeSize(prog, &codeSize)); - - std::vector kernel_binary(codeSize); - HIPRTC_CHECK(hiprtcGetCode(prog, kernel_binary.data())); - - HIPRTC_CHECK(hiprtcDestroyProgram(&prog)); - - hipModule_t module; - hipFunction_t kernel; - - HIP_CHECK(hipModuleLoadData(&module, kernel_binary.data())); - HIP_CHECK(hipModuleGetFunction(&kernel, module, "vector_add")); - - constexpr size_t ele_size = 256; // total number of items to add - std::vector hinput, output; - hinput.reserve(ele_size); - output.reserve(ele_size); - for (size_t i = 0; i < ele_size; i++) { - hinput.push_back(static_cast(i + 1)); - output.push_back(0.0f); - } - - float *dinput1, *dinput2, *doutput; - HIP_CHECK(hipMalloc(&dinput1, sizeof(float) * ele_size)); - HIP_CHECK(hipMalloc(&dinput2, sizeof(float) * ele_size)); - 
HIP_CHECK(hipMalloc(&doutput, sizeof(float) * ele_size)); - - HIP_CHECK(hipMemcpy(dinput1, hinput.data(), sizeof(float) * ele_size, hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(dinput2, hinput.data(), sizeof(float) * ele_size, hipMemcpyHostToDevice)); - - struct { - float* output; - float* input1; - float* input2; - size_t size; - } args{doutput, dinput1, dinput2, ele_size}; - - auto size = sizeof(args); - void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, - HIP_LAUNCH_PARAM_END}; - - HIP_CHECK(hipModuleLaunchKernel(kernel, 1, 1, 1, ele_size, 1, 1, 0, nullptr, nullptr, config)); - - HIP_CHECK(hipMemcpy(output.data(), doutput, sizeof(float) * ele_size, hipMemcpyDeviceToHost)); - - for (size_t i = 0; i < ele_size; i++) { - if ((hinput[i] + hinput[i]) != output[i]) { - std::cout << "Failed in validation: " << (hinput[i] + hinput[i]) << " - " << output[i] - << std::endl; - std::abort(); - } - } - std::cout << "Passed" << std::endl; - - HIP_CHECK(hipFree(dinput1)); - HIP_CHECK(hipFree(dinput2)); - HIP_CHECK(hipFree(doutput)); -} -``` - -## Kernel Compilation Cache - -HIPRTC incorporates a cache to avoid recompiling kernels between program executions. The contents of the cache include the kernel source code (including the contents of any `#include` headers), the compilation flags, and the compiler version. After a ROCm version update, the kernels are progressively recompiled, and the new results are cached. When the cache is disabled, each kernel is recompiled every time it is requested. - -Use the following environment variables to manage the cache status as enabled or disabled, the location for storing the cache contents, and the cache eviction policy: - -* `AMD_COMGR_CACHE` By default this variable has a value of `0` and the compilation cache feature is disabled. To enable the feature set the environment variable to a value of `1` (or any value other than `0`). This behavior may change in a future release. - -* `AMD_COMGR_CACHE_DIR`: By default the value of this environment variable is defined as `$XDG_CACHE_HOME/comgr_cache`, which defaults to `$USER/.cache/comgr_cache` on Linux, and `%LOCALAPPDATA%\cache\comgr_cache` on Windows. You can specify a different directory for the environment variable to change the path for cache storage. If the runtime fails to access the specified cache directory, or the environment variable is set to an empty string (""), the cache is disabled. - -* `AMD_COMGR_CACHE_POLICY`: If assigned a value, the string is interpreted and applied to the cache pruning policy. The string format is consistent with [Clang's ThinLTO cache pruning policy](https://rocm.docs.amd.com/projects/llvm-project/en/latest/LLVM/clang/html/ThinLTO.html#cache-pruning). The default policy is defined as: `prune_interval=1h:prune_expiration=0h:cache_size=75%:cache_size_bytes=30g:cache_size_files=0`. If the runtime fails to parse the defined string, or the environment variable is set to an empty string (""), the cache is disabled. - -:::{note} - This cache is also shared with the OpenCL runtime shipped with ROCm. -::: - -## HIPRTC specific options - -HIPRTC provides a few HIPRTC specific flags - -* `--gpu-architecture` : This flag can guide the code object generation for a specific gpu arch. Example: `--gpu-architecture=gfx906:sramecc+:xnack-`, its equivalent to `--offload-arch`. - * This option is compulsory if compilation is done on a system without AMD GPUs supported by HIP runtime. 
- * Otherwise, HIPRTC will load the hip runtime and gather the current device and its architecture info and use it as option. -* `-fgpu-rdc` : This flag when provided during the `hiprtcCompileProgram` generates the bitcode (HIPRTC doesn't convert this bitcode into ISA and binary). This bitcode can later be fetched using `hiprtcGetBitcode` and `hiprtcGetBitcodeSize` APIs. - -### Bitcode - -In the usual scenario, the kernel associated with `hiprtcProgram` is compiled into the binary which can be loaded and run. However, if `-fpu-rdc` option is provided in the compile options, HIPRTC calls comgr and generates only the LLVM bitcode. It doesn't convert this bitcode to ISA and generate the final binary. - -```cpp -std::string sarg = std::string("-fgpu-rdc"); -const char* options[] = { - sarg.c_str() }; -hiprtcCompileProgram(prog, // hiprtcProgram - 1, // Number of options - options); -``` - -If the compilation is successful, one can load the bitcode in a local variable using the bitcode APIs provided by HIPRTC. - -```cpp -size_t bitCodeSize; -hiprtcGetBitcodeSize(prog, &bitCodeSize); - -vector kernel_bitcode(bitCodeSize); -hiprtcGetBitcode(prog, kernel_bitcode.data()); -``` - -### CU Mode vs WGP mode - -AMD GPUs consist of an array of workgroup processors, each built with 2 compute units (CUs) capable of executing SIMD32. All the CUs inside a workgroup processor use local data share (LDS). - -gfx10+ support execution of wavefront in CU mode and work-group processor mode (WGP). Please refer to section 2.3 of [RDNA3 ISA reference](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf). - -gfx9 and below only supports CU mode. - -In WGP mode, 4 warps of a block can simultaneously be executed on the workgroup processor, where as in CU mode only 2 warps of a block can simultaneously execute on a CU. In theory, WGP mode might help with occupancy and increase the performance of certain HIP programs (if not bound to inter warp communication), but might incur performance penalty on other HIP programs which rely on atomics and inter warp communication. This also has effect of how the LDS is split between warps, please refer to [RDNA3 ISA reference](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf) for more information. - -HIPRTC assumes **WGP mode by default** for gfx10+. This can be overridden by passing `-mcumode` to HIPRTC compile options in `hiprtcCompileProgram`. - -## Linker APIs - -The bitcode generated using the HIPRTC Bitcode APIs can be loaded using `hipModule` APIs and also can be linked with other generated bitcodes with appropriate linker flags using the HIPRTC linker APIs. This also provides more flexibility and optimizations to the applications who want to generate the binary dynamically according to their needs. The input bitcodes can be generated only for a specific architecture or it can be a bundled bitcode which is generated for multiple architectures. - -### Example - -Firstly, HIPRTC link instance or a pending linker invocation must be created using `hiprtcLinkCreate`, with the appropriate linker options provided. 
- -```cpp -hiprtcLinkCreate( num_options, // number of options - options, // Array of options - option_vals, // Array of option values cast to void* - &rtc_link_state ); // HIPRTC link state created upon success -``` - -Following which, the bitcode data can be added to this link instance via `hiprtcLinkAddData` (if the data is present as a string) or `hiprtcLinkAddFile` (if the data is present as a file) with the appropriate input type according to the data or the bitcode used. - -```cpp -hiprtcLinkAddData(rtc_link_state, // HIPRTC link state - input_type, // type of the input data or bitcode - bit_code_ptr, // input data which is null terminated - bit_code_size, // size of the input data - "a", // optional name for this input - 0, // size of the options - 0, // Array of options applied to this input - 0); // Array of option values cast to void* -``` - -```cpp -hiprtcLinkAddFile(rtc_link_state, // HIPRTC link state - input_type, // type of the input data or bitcode - bc_file_path.c_str(), // path to the input file where bitcode is present - 0, // size of the options - 0, // Array of options applied to this input - 0); // Array of option values cast to void* -``` - -Once the bitcodes for multiple architectures are added to the link instance, the linking of the device code must be completed using `hiprtcLinkComplete` which generates the final binary. - -```cpp -hiprtcLinkComplete(rtc_link_state, // HIPRTC link state - &binary, // upon success, points to the output binary - &binarySize); // size of the binary is stored (optional) -``` - -If the `hiprtcLinkComplete` returns successfully, the generated binary can be loaded and run using the `hipModule*` APIs. - -```cpp -hipModuleLoadData(&module, binary); -``` - -#### Note - -* The compiled binary must be loaded before HIPRTC link instance is destroyed using the `hiprtcLinkDestroy` API. - -```cpp -hiprtcLinkDestroy(rtc_link_state); -``` - -* The correct sequence of calls is : `hiprtcLinkCreate`, `hiprtcLinkAddData` or `hiprtcLinkAddFile`, `hiprtcLinkComplete`, `hiprtcModuleLoadData`, `hiprtcLinkDestroy`. - -### Input Types - -HIPRTC provides `hiprtcJITInputType` enumeration type which defines the input types accepted by the Linker APIs. Here are the `enum` values of `hiprtcJITInputType`. However only the input types `HIPRTC_JIT_INPUT_LLVM_BITCODE`, `HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE` and `HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE` are supported currently. - -`HIPRTC_JIT_INPUT_LLVM_BITCODE` can be used to load both LLVM bitcode or LLVM IR assembly code. However, `HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE` and `HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE` are only for bundled bitcode and archive of bundled bitcode. - -```cpp -HIPRTC_JIT_INPUT_CUBIN = 0, -HIPRTC_JIT_INPUT_PTX, -HIPRTC_JIT_INPUT_FATBINARY, -HIPRTC_JIT_INPUT_OBJECT, -HIPRTC_JIT_INPUT_LIBRARY, -HIPRTC_JIT_INPUT_NVVM, -HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES, -HIPRTC_JIT_INPUT_LLVM_BITCODE = 100, -HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE = 101, -HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE = 102, -HIPRTC_JIT_NUM_INPUT_TYPES = (HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES + 3) -``` - -### Backward Compatibility of LLVM Bitcode/IR - -For HIP applications utilizing HIPRTC to compile LLVM bitcode/IR, compatibility is assured only when the ROCm or HIP SDK version used for generating the LLVM bitcode/IR matches the version used during the runtime compilation. 
When an application requires the ingestion of bitcode/IR not derived from the currently installed AMD compiler, it must run with HIPRTC and comgr dynamic libraries that are compatible with the version of the bitcode/IR. - -comgr, a shared library, incorporates the LLVM/Clang compiler that HIPRTC relies on. To identify the bitcode/IR version that comgr is compatible with, one can execute "clang -v" using the clang binary from the same ROCm or HIP SDK package. For instance, if compiling bitcode/IR version 14, the HIPRTC and comgr libraries released by AMD around mid 2022 would be the best choice, assuming the LLVM/Clang version included in the package is also version 14. - -To ensure smooth operation and compatibility, an application may choose to ship the specific versions of HIPRTC and comgr dynamic libraries, or it may opt to clearly specify the version requirements and dependencies. This approach guarantees that the application can correctly compile the specified version of bitcode/IR. - -### Link Options - -* `HIPRTC_JIT_IR_TO_ISA_OPT_EXT` - AMD Only. Options to be passed on to link step of compiler by `hiprtcLinkCreate`. -* `HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT` - AMD Only. Count of options passed on to link step of compiler. - -Example: - -```cpp -const char* isaopts[] = {"-mllvm", "-inline-threshold=1", "-mllvm", "-inlinehint-threshold=1"}; -std::vector jit_options = {HIPRTC_JIT_IR_TO_ISA_OPT_EXT, - HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT}; -size_t isaoptssize = 4; -const void* lopts[] = {(void*)isaopts, (void*)(isaoptssize)}; -hiprtcLinkState linkstate; -hiprtcLinkCreate(2, jit_options.data(), (void**)lopts, &linkstate); -``` - -## Error Handling - -HIPRTC defines the `hiprtcResult` enumeration type and a function `hiprtcGetErrorString` for API call error handling. `hiprtcResult` `enum` defines the API result codes. HIPRTC APIs return `hiprtcResult` to indicate the call result. `hiprtcGetErrorString` function returns a string describing the given `hiprtcResult` code, e.g., HIPRTC_SUCCESS to "HIPRTC_SUCCESS". For unrecognized enumeration values, it returns "Invalid HIPRTC error code". - -`hiprtcResult` `enum` supported values and the `hiprtcGetErrorString` usage are mentioned below. - -```cpp -HIPRTC_SUCCESS = 0, -HIPRTC_ERROR_OUT_OF_MEMORY = 1, -HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, -HIPRTC_ERROR_INVALID_INPUT = 3, -HIPRTC_ERROR_INVALID_PROGRAM = 4, -HIPRTC_ERROR_INVALID_OPTION = 5, -HIPRTC_ERROR_COMPILATION = 6, -HIPRTC_ERROR_LINKING = 7, -HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 8, -HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 9, -HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 10, -HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 11, -HIPRTC_ERROR_INTERNAL_ERROR = 12 -``` - -```cpp -hiprtcResult result; -result = hiprtcCompileProgram(prog, 1, opts); -if (result != HIPRTC_SUCCESS) { -std::cout << "hiprtcCompileProgram fails with error " << hiprtcGetErrorString(result); -} -``` - -## HIPRTC General APIs - -HIPRTC provides the following API for querying the version. - -`hiprtcVersion(int* major, int* minor)` - This sets the output parameters major and minor with the HIP Runtime compilation major version and minor version number respectively. - -Currently, it returns hardcoded value. This should be implemented to return HIP runtime major and minor version in the future releases. - -## Lowered Names (Mangled Names) - -HIPRTC mangles the `__global__` function names and names of `__device__` and `__constant__` variables. 
If the generated binary is being loaded using the HIP Runtime API, the kernel function or `__device__/__constant__` variable must be looked up by name, but this is very hard when the name has been mangled. To overcome this, HIPRTC provides API functions that map `__global__` function or `__device__/__constant__` variable names in the source to the mangled names present in the generated binary. - -The two APIs `hiprtcAddNameExpression` and `hiprtcGetLoweredName` provide this functionality. First, a 'name expression' string denoting the address for the `__global__` function or `__device__/__constant__` variable is provided to `hiprtcAddNameExpression`. Then, the program is compiled with `hiprtcCompileProgram`. During compilation, HIPRTC will parse the name expression string as a C++ constant expression at the end of the user program. Finally, the function `hiprtcGetLoweredName` is called with the original name expression and it returns a pointer to the lowered name. The lowered name can be used to refer to the kernel or variable in the HIP Runtime API. - -### Note - -* The identical name expression string must be provided on a subsequent call to `hiprtcGetLoweredName` to extract the lowered name. -* The correct sequence of calls is : `hiprtcAddNameExpression`, `hiprtcCompileProgram`, `hiprtcGetLoweredName`, `hiprtcDestroyProgram`. -* The lowered names must be fetched using `hiprtcGetLoweredName` only after the HIPRTC program has been compiled, and before it has been destroyed. - -### Example - -kernel containing various definitions `__global__` functions/function templates and `__device__/__constant__` variables can be stored in a string. - -```cpp -static constexpr const char gpu_program[] { -R"( -__device__ int V1; // set from host code -static __global__ void f1(int *result) { *result = V1 + 10; } -namespace N1 { -namespace N2 { -__constant__ int V2; // set from host code -__global__ void f2(int *result) { *result = V2 + 20; } -} -} -template -__global__ void f3(int *result) { *result = sizeof(T); } -)"}; -``` - -`hiprtcAddNameExpression` is called with various name expressions referring to the address of `__global__` functions and `__device__/__constant__` variables. - -```cpp -kernel_name_vec.push_back("&f1"); -kernel_name_vec.push_back("N1::N2::f2"); -kernel_name_vec.push_back("f3"); -for (auto&& x : kernel_name_vec) hiprtcAddNameExpression(prog, x.c_str()); -variable_name_vec.push_back("&V1"); -variable_name_vec.push_back("&N1::N2::V2"); -for (auto&& x : variable_name_vec) hiprtcAddNameExpression(prog, x.c_str()); -``` - -After which, the program is compiled using `hiprtcCompileProgram` and the generated binary is loaded using `hipModuleLoadData`. And the mangled names can be fetched using `hirtcGetLoweredName`. - -```cpp -for (decltype(variable_name_vec.size()) i = 0; i != variable_name_vec.size(); ++i) { - const char* name; - hiprtcGetLoweredName(prog, variable_name_vec[i].c_str(), &name); -} -``` - -```cpp -for (decltype(kernel_name_vec.size()) i = 0; i != kernel_name_vec.size(); ++i) { - const char* name; - hiprtcGetLoweredName(prog, kernel_name_vec[i].c_str(), &name); -} -``` - -The mangled name of the variables are used to look up the variable in the module and update its value. - -```cpp -hipDeviceptr_t variable_addr; -size_t bytes{}; -hipModuleGetGlobal(&variable_addr, &bytes, module, name); -hipMemcpyHtoD(variable_addr, &initial_value, sizeof(initial_value)); -``` - -Finally, the mangled name of the kernel is used to launch it using the `hipModule` APIs. 
- -```cpp -hipFunction_t kernel; -hipModuleGetFunction(&kernel, module, name); -hipModuleLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, nullptr, nullptr, config); -``` - -Please have a look at `hiprtcGetLoweredName.cpp` for the detailed example. - -## Versioning - -HIPRTC follows the below versioning. - -* Linux - * HIPRTC follows the same versioning as HIP runtime library. - * The `so` name field for the shared library is set to MAJOR version. For example, for HIP 5.3 the `so` name is set to 5 (`hiprtc.so.5`). -* Windows - * HIPRTC dll is named as `hiprtcXXYY.dll` where XX is MAJOR version and YY is MINOR version. For example, for HIP 5.3 the name is `hiprtc0503.dll`. - -## HIP header support - -* Added HIPRTC support for all the hip common header files such as library_types.h, hip_math_constants.h, hip_complex.h, math_functions.h, surface_types.h etc. from 6.1. HIPRTC users need not include any HIP macros or constants explicitly in their header files. All of these should get included via HIPRTC builtins when the app links to HIPRTC library. - -## Deprecation notice - -* Currently HIPRTC APIs are separated from HIP APIs and HIPRTC is available as a separate library `libhiprtc.so`/`libhiprtc.dll`. But on Linux, HIPRTC symbols are also present in `libamdhip64.so` in order to support the existing applications. Gradually, these symbols will be removed from HIP library and applications using HIPRTC will be required to explicitly link to HIPRTC library. However, on Windows `hiprtc.dll` must be used as the `amdhip64.dll` doesn't contain the HIPRTC symbols. -* Data types such as `uint32_t`, `uint64_t`, `int32_t`, `int64_t` defined in std namespace in HIPRTC are deprecated earlier and are being removed from ROCm release 6.1 since these can conflict with the standard C++ data types. These data types are now prefixed with `__hip__`, e.g. `__hip_uint32_t`. Applications previously using `std::uint32_t` or similar types can use `__hip_` prefixed types to avoid conflicts with standard std namespace or application can have their own definitions for these types. Also, type_traits templates previously defined in std namespace are moved to `__hip_internal` namespace as implementation details. diff --git a/docs/how-to/hip_rtc.rst b/docs/how-to/hip_rtc.rst new file mode 100644 index 0000000000..734bf60284 --- /dev/null +++ b/docs/how-to/hip_rtc.rst @@ -0,0 +1,726 @@ +.. meta:: + :description: HIP runtime compiler (RTC) + :keywords: AMD, ROCm, HIP, CUDA, RTC, HIP runtime compiler + +.. _hip_runtime_compiler_how-to: + +******************************************************************************* +Programming for HIP runtime compiler (RTC) +******************************************************************************* + +HIP supports the kernels compilation at runtime with the ``hiprtc*`` APIs. +Kernels can be stored as a text string and can be passed to HIPRTC APIs +alongside options to guide the compilation. + +.. note:: + + * This library can be used for compilation on systems without AMD GPU drivers + installed (offline compilation). However, running the compiled code still + requires both the HIP runtime library and GPU drivers on the target system. + * This library depends on Code Object Manager (comgr). You can try to + statically link comgr into HIPRTC to avoid ambiguity. + * Developers can bundle this library with their application. + +Compilation APIs +=============================================================================== + +To use HIPRTC functionality the header needs to be included: + +.. 
code-block:: cpp + + #include + +Kernels can be stored in a string: + +.. code-block:: cpp + + static constexpr auto kernel_source { + R"( + extern "C" + __global__ void vector_add(float* output, float* input1, float* input2, size_t size) { + int i = threadIdx.x; + if (i < size) { + output[i] = input1[i] + input2[i]; + } + } + )"}; + +To compile this kernel, it needs to be associated with +:cpp:struct:`hiprtcProgram` type, which is done by declaring :code:`hiprtcProgram prog;` +and associating the string of kernel with this program: + +.. code-block:: cpp + + hiprtcCreateProgram(&prog, // HIPRTC program handle + kernel_source, // HIP kernel source string + "vector_add.cpp", // Name of the HIP program, can be null or an empty string + 0, // Number of headers + NULL, // Header sources + NULL); // Name of header files + +:cpp:func:`hiprtcCreateProgram` API also allows you to add headers which can be +included in your RTC program. For online compilation, the compiler pre-defines +HIP device API functions, HIP specific types and macros for device compilation, +but doesn't include standard C/C++ headers by default. Users can only include +header files provided to :cpp:func:`hiprtcCreateProgram`. + +After associating the kernel string with :cpp:struct:`hiprtcProgram`, you can +now compile this program using: + +.. code-block:: cpp + + hiprtcCompileProgram(prog, // hiprtcProgram + 0, // Number of options + options); // Clang Options [Supported Clang Options](clang_options.md) + +:cpp:func:`hiprtcCompileProgram` returns a status value which can be converted +to string via :cpp:func:`hiprtcGetErrorString`. If compilation is successful, +:cpp:func:`hiprtcCompileProgram` will return ``HIPRTC_SUCCESS``. + +if the compilation fails or produces warnings, you can look up the logs via: + +.. code-block:: cpp + + size_t logSize; + hiprtcGetProgramLogSize(prog, &logSize); + + if (logSize) { + string log(logSize, '\0'); + hiprtcGetProgramLog(prog, &log[0]); + // Corrective action with logs + } + +If the compilation is successful, you can load the compiled binary in a local +variable. + +.. code-block:: cpp + + size_t codeSize; + hiprtcGetCodeSize(prog, &codeSize); + + vector kernel_binary(codeSize); + hiprtcGetCode(prog, kernel_binary.data()); + +After loading the binary, :cpp:struct:`hiprtcProgram` can be destroyed. +:code:`hiprtcDestroyProgram(&prog);` + +The binary present in ``kernel_binary`` can now be loaded via +:cpp:func:`hipModuleLoadData` API. + +.. code-block:: cpp + + hipModule_t module; + hipFunction_t kernel; + + hipModuleLoadData(&module, kernel_binary.data()); + hipModuleGetFunction(&kernel, module, "vector_add"); + +And now this kernel can be launched via ``hipModule`` APIs. + +The full example is below: + +.. 
code-block:: cpp + + #include + #include + + #include + #include + #include + + #define CHECK_RET_CODE(call, ret_code) \ + { \ + if ((call) != ret_code) { \ + std::cout << "Failed in call: " << #call << std::endl; \ + std::abort(); \ + } \ + } + #define HIP_CHECK(call) CHECK_RET_CODE(call, hipSuccess) + #define HIPRTC_CHECK(call) CHECK_RET_CODE(call, HIPRTC_SUCCESS) + + // source code for hiprtc + static constexpr auto kernel_source{ + R"( + extern "C" + __global__ void vector_add(float* output, float* input1, float* input2, size_t size) { + int i = threadIdx.x; + if (i < size) { + output[i] = input1[i] + input2[i]; + } + } + )"}; + + int main() { + hiprtcProgram prog; + auto rtc_ret_code = hiprtcCreateProgram(&prog, // HIPRTC program handle + kernel_source, // kernel source string + "vector_add.cpp", // Name of the file + 0, // Number of headers + NULL, // Header sources + NULL); // Name of header file + + if (rtc_ret_code != HIPRTC_SUCCESS) { + std::cout << "Failed to create program" << std::endl; + std::abort(); + } + + hipDeviceProp_t props; + int device = 0; + HIP_CHECK(hipGetDeviceProperties(&props, device)); + std::string sarg = std::string("--gpu-architecture=") + + props.gcnArchName; // device for which binary is to be generated + + const char* options[] = {sarg.c_str()}; + + rtc_ret_code = hiprtcCompileProgram(prog, // hiprtcProgram + 0, // Number of options + options); // Clang Options + if (rtc_ret_code != HIPRTC_SUCCESS) { + std::cout << "Failed to create program" << std::endl; + std::abort(); + } + + size_t logSize; + HIPRTC_CHECK(hiprtcGetProgramLogSize(prog, &logSize)); + + if (logSize) { + std::string log(logSize, '\0'); + HIPRTC_CHECK(hiprtcGetProgramLog(prog, &log[0])); + std::cout << "Compilation failed or produced warnings: " << log << std::endl; + std::abort(); + } + + size_t codeSize; + HIPRTC_CHECK(hiprtcGetCodeSize(prog, &codeSize)); + + std::vector kernel_binary(codeSize); + HIPRTC_CHECK(hiprtcGetCode(prog, kernel_binary.data())); + + HIPRTC_CHECK(hiprtcDestroyProgram(&prog)); + + hipModule_t module; + hipFunction_t kernel; + + HIP_CHECK(hipModuleLoadData(&module, kernel_binary.data())); + HIP_CHECK(hipModuleGetFunction(&kernel, module, "vector_add")); + + constexpr size_t ele_size = 256; // total number of items to add + std::vector hinput, output; + hinput.reserve(ele_size); + output.reserve(ele_size); + for (size_t i = 0; i < ele_size; i++) { + hinput.push_back(static_cast(i + 1)); + output.push_back(0.0f); + } + + float *dinput1, *dinput2, *doutput; + HIP_CHECK(hipMalloc(&dinput1, sizeof(float) * ele_size)); + HIP_CHECK(hipMalloc(&dinput2, sizeof(float) * ele_size)); + HIP_CHECK(hipMalloc(&doutput, sizeof(float) * ele_size)); + + HIP_CHECK(hipMemcpy(dinput1, hinput.data(), sizeof(float) * ele_size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(dinput2, hinput.data(), sizeof(float) * ele_size, hipMemcpyHostToDevice)); + + struct { + float* output; + float* input1; + float* input2; + size_t size; + } args{doutput, dinput1, dinput2, ele_size}; + + auto size = sizeof(args); + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END}; + + HIP_CHECK(hipModuleLaunchKernel(kernel, 1, 1, 1, ele_size, 1, 1, 0, nullptr, nullptr, config)); + + HIP_CHECK(hipMemcpy(output.data(), doutput, sizeof(float) * ele_size, hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < ele_size; i++) { + if ((hinput[i] + hinput[i]) != output[i]) { + std::cout << "Failed in validation: " << (hinput[i] + hinput[i]) << " - " << 
output[i] + << std::endl; + std::abort(); + } + } + std::cout << "Passed" << std::endl; + + HIP_CHECK(hipFree(dinput1)); + HIP_CHECK(hipFree(dinput2)); + HIP_CHECK(hipFree(doutput)); + } + + +Kernel Compilation Cache +=============================================================================== + +HIPRTC incorporates a cache to avoid recompiling kernels between program +executions. The contents of the cache include the kernel source code (including +the contents of any ``#include`` headers), the compilation flags, and the +compiler version. After a ROCm version update, the kernels are progressively +recompiled, and the new results are cached. When the cache is disabled, each +kernel is recompiled every time it is requested. + +Use the following environment variables to manage the cache status as enabled or +disabled, the location for storing the cache contents, and the cache eviction +policy: + +* ``AMD_COMGR_CACHE`` By default this variable has a value of ``0`` and the + compilation cache feature is disabled. To enable the feature set the + environment variable to a value of ``1`` (or any value other than ``0``). + +* ``AMD_COMGR_CACHE_DIR``: By default the value of this environment variable is + defined as ``$XDG_CACHE_HOME/comgr_cache``, which defaults to + ``$USER/.cache/comgr_cache`` on Linux, and ``%LOCALAPPDATA%\cache\comgr_cache`` + on Windows. You can specify a different directory for the environment variable + to change the path for cache storage. If the runtime fails to access the + specified cache directory, or the environment variable is set to an empty + string (""), the cache is disabled. + +* ``AMD_COMGR_CACHE_POLICY``: If assigned a value, the string is interpreted and + applied to the cache pruning policy. The string format is consistent with + `Clang's ThinLTO cache pruning policy `_. + The default policy is defined as: + ``prune_interval=1h:prune_expiration=0h:cache_size=75%:cache_size_bytes=30g:cache_size_files=0``. + If the runtime fails to parse the defined string, or the environment variable + is set to an empty string (""), the cache is disabled. + +.. note:: + + This cache is also shared with the OpenCL runtime shipped with ROCm. + +HIPRTC specific options +=============================================================================== + +HIPRTC provides a few HIPRTC specific flags: + +* ``--gpu-architecture`` : This flag can guide the code object generation for a + specific GPU architecture. Example: + ``--gpu-architecture=gfx906:sramecc+:xnack-``, its equivalent to + ``--offload-arch``. + + * This option is compulsory if compilation is done on a system without AMD + GPUs supported by HIP runtime. + + * Otherwise, HIPRTC will load the hip runtime and gather the current device + and its architecture info and use it as option. + +* ``-fgpu-rdc`` : This flag when provided during the + :cpp:func:`hiprtcCreateProgram` generates the bitcode (HIPRTC doesn't convert + this bitcode into ISA and binary). This bitcode can later be fetched using + :cpp:func:`hiprtcGetBitcode` and :cpp:func:`hiprtcGetBitcodeSize` APIs. + +Bitcode +------------------------------------------------------------------------------- + +In the usual scenario, the kernel associated with :cpp:struct:`hiprtcProgram` is +compiled into the binary which can be loaded and run. However, if ``-fgpu-rdc`` +option is provided in the compile options, HIPRTC calls comgr and generates only +the LLVM bitcode. It doesn't convert this bitcode to ISA and generate the final +binary. + +.. 
code-block:: cpp + + std::string sarg = std::string("-fgpu-rdc"); + const char* options[] = { + sarg.c_str() }; + hiprtcCompileProgram(prog, // hiprtcProgram + 1, // Number of options + options); + +If the compilation is successful, one can load the bitcode in a local variable +using the bitcode APIs provided by HIPRTC. + +.. code-block:: cpp + + size_t bitCodeSize; + hiprtcGetBitcodeSize(prog, &bitCodeSize); + + vector kernel_bitcode(bitCodeSize); + hiprtcGetBitcode(prog, kernel_bitcode.data()); + +CU Mode vs WGP mode +------------------------------------------------------------------------------- + +AMD GPUs consist of an array of workgroup processors, each built with 2 compute +units (CUs) capable of executing SIMD32. All the CUs inside a workgroup +processor use local data share (LDS). + +gfx10+ support execution of wavefront in CU mode and work-group processor mode +(WGP). Please refer to section 2.3 of `RDNA3 ISA reference `_. + +gfx9 and below only supports CU mode. + +In WGP mode, 4 warps of a block can simultaneously be executed on the workgroup +processor, where as in CU mode only 2 warps of a block can simultaneously +execute on a CU. In theory, WGP mode might help with occupancy and increase the +performance of certain HIP programs (if not bound to inter warp communication), +but might incur performance penalty on other HIP programs which rely on atomics +and inter warp communication. This also has effect of how the LDS is split +between warps, please refer to `RDNA3 ISA reference `_ for more information. + +.. note:: + + HIPRTC assumes **WGP mode by default** for gfx10+. This can be overridden by + passing ``-mcumode`` to HIPRTC compile options in + :cpp:func:`hiprtcCompileProgram`. + +Linker APIs +=============================================================================== + +The bitcode generated using the HIPRTC Bitcode APIs can be loaded using +``hipModule`` APIs and also can be linked with other generated bitcodes with +appropriate linker flags using the HIPRTC linker APIs. This also provides more +flexibility and optimizations to the applications who want to generate the +binary dynamically according to their needs. The input bitcodes can be generated +only for a specific architecture or it can be a bundled bitcode which is +generated for multiple architectures. + +Example +------------------------------------------------------------------------------- + +Firstly, HIPRTC link instance or a pending linker invocation must be created +using :cpp:func:`hiprtcLinkCreate`, with the appropriate linker options +provided. + +.. code-block:: cpp + + hiprtcLinkCreate( num_options, // number of options + options, // Array of options + option_vals, // Array of option values cast to void* + &rtc_link_state ); // HIPRTC link state created upon success + +Following which, the bitcode data can be added to this link instance via +:cpp:func:`hiprtcLinkAddData` (if the data is present as a string) or +:cpp:func:`hiprtcLinkAddFile` (if the data is present as a file) with the +appropriate input type according to the data or the bitcode used. + +.. code-block:: cpp + + hiprtcLinkAddData(rtc_link_state, // HIPRTC link state + input_type, // type of the input data or bitcode + bit_code_ptr, // input data which is null terminated + bit_code_size, // size of the input data + "a", // optional name for this input + 0, // size of the options + 0, // Array of options applied to this input + 0); // Array of option values cast to void* + +.. 
code-block:: cpp + + hiprtcLinkAddFile(rtc_link_state, // HIPRTC link state + input_type, // type of the input data or bitcode + bc_file_path.c_str(), // path to the input file where bitcode is present + 0, // size of the options + 0, // Array of options applied to this input + 0); // Array of option values cast to void* + +Once the bitcodes for multiple architectures are added to the link instance, the +linking of the device code must be completed using :cpp:func:`hiprtcLinkComplete` +which generates the final binary. + +.. code-block:: cpp + + hiprtcLinkComplete(rtc_link_state, // HIPRTC link state + &binary, // upon success, points to the output binary + &binarySize); // size of the binary is stored (optional) + +If the :cpp:func:`hiprtcLinkComplete` returns successfully, the generated binary +can be loaded and run using the ``hipModule*`` APIs. + +.. code-block:: cpp + + hipModuleLoadData(&module, binary); + +.. note:: + + * The compiled binary must be loaded before HIPRTC link instance is destroyed + using the :cpp:func:`hiprtcLinkDestroy` API. + + .. code-block:: cpp + + hiprtcLinkDestroy(rtc_link_state); + + * The correct sequence of calls is : :cpp:func:`hiprtcLinkCreate`, + :cpp:func:`hiprtcLinkAddData` or :cpp:func:`hiprtcLinkAddFile`, + :cpp:func:`hiprtcLinkComplete`, :cpp:func:`hipModuleLoadData`, + :cpp:func:`hiprtcLinkDestroy`. + +Input Types +------------------------------------------------------------------------------- + +HIPRTC provides ``hiprtcJITInputType`` enumeration type which defines the input +types accepted by the Linker APIs. Here are the ``enum`` values of +``hiprtcJITInputType``. However only the input types +``HIPRTC_JIT_INPUT_LLVM_BITCODE``, ``HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE`` and +``HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE`` are supported currently. + +``HIPRTC_JIT_INPUT_LLVM_BITCODE`` can be used to load both LLVM bitcode or LLVM +IR assembly code. However, ``HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE`` and +``HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE`` are only for bundled +bitcode and archive of bundled bitcode. + +.. code-block:: cpp + + HIPRTC_JIT_INPUT_CUBIN = 0, + HIPRTC_JIT_INPUT_PTX, + HIPRTC_JIT_INPUT_FATBINARY, + HIPRTC_JIT_INPUT_OBJECT, + HIPRTC_JIT_INPUT_LIBRARY, + HIPRTC_JIT_INPUT_NVVM, + HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES, + HIPRTC_JIT_INPUT_LLVM_BITCODE = 100, + HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE = 101, + HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE = 102, + HIPRTC_JIT_NUM_INPUT_TYPES = (HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES + 3) + +Backward Compatibility of LLVM Bitcode/IR +------------------------------------------------------------------------------- + +For HIP applications utilizing HIPRTC to compile LLVM bitcode/IR, compatibility +is assured only when the ROCm or HIP SDK version used for generating the LLVM +bitcode/IR matches the version used during the runtime compilation. When an +application requires the ingestion of bitcode/IR not derived from the currently +installed AMD compiler, it must run with HIPRTC and comgr dynamic libraries that +are compatible with the version of the bitcode/IR. + +`Comgr `_ is a +shared library that incorporates the LLVM/Clang compiler that HIPRTC relies on. +To identify the bitcode/IR version that comgr is compatible with, one can +execute "clang -v" using the clang binary from the same ROCm or HIP SDK package. 
+For instance, if compiling bitcode/IR version 14, the HIPRTC and comgr libraries +released by AMD around mid 2022 would be the best choice, assuming the +LLVM/Clang version included in the package is also version 14. + +To ensure smooth operation and compatibility, an application may choose to ship +the specific versions of HIPRTC and comgr dynamic libraries, or it may opt to +clearly specify the version requirements and dependencies. This approach +guarantees that the application can correctly compile the specified version of +bitcode/IR. + +Link Options +------------------------------------------------------------------------------- + +* ``HIPRTC_JIT_IR_TO_ISA_OPT_EXT`` - AMD Only. Options to be passed on to link + step of compiler by :cpp:func:`hiprtcLinkCreate`. + +* ``HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT`` - AMD Only. Count of options passed on + to link step of compiler. + +Example: + +.. code-block:: cpp + + const char* isaopts[] = {"-mllvm", "-inline-threshold=1", "-mllvm", "-inlinehint-threshold=1"}; + std::vector jit_options = {HIPRTC_JIT_IR_TO_ISA_OPT_EXT, + HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT}; + size_t isaoptssize = 4; + const void* lopts[] = {(void*)isaopts, (void*)(isaoptssize)}; + hiprtcLinkState linkstate; + hiprtcLinkCreate(2, jit_options.data(), (void**)lopts, &linkstate); + +Error Handling +=============================================================================== + +HIPRTC defines the ``hiprtcResult`` enumeration type and a function +:cpp:func:`hiprtcGetErrorString` for API call error handling. ``hiprtcResult`` +``enum`` defines the API result codes. HIPRTC APIs return ``hiprtcResult`` to +indicate the call result. :cpp:func:`hiprtcGetErrorString` function returns a +string describing the given ``hiprtcResult`` code, for example HIPRTC_SUCCESS to +"HIPRTC_SUCCESS". For unrecognized enumeration values, it returns +"Invalid HIPRTC error code". + +``hiprtcResult`` ``enum`` supported values and the +:cpp:func:`hiprtcGetErrorString` usage are mentioned below. + +.. code-block:: cpp + + HIPRTC_SUCCESS = 0, + HIPRTC_ERROR_OUT_OF_MEMORY = 1, + HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, + HIPRTC_ERROR_INVALID_INPUT = 3, + HIPRTC_ERROR_INVALID_PROGRAM = 4, + HIPRTC_ERROR_INVALID_OPTION = 5, + HIPRTC_ERROR_COMPILATION = 6, + HIPRTC_ERROR_LINKING = 7, + HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 8, + HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 9, + HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 10, + HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 11, + HIPRTC_ERROR_INTERNAL_ERROR = 12 + +.. code-block:: cpp + + hiprtcResult result; + result = hiprtcCompileProgram(prog, 1, opts); + if (result != HIPRTC_SUCCESS) { + std::cout << "hiprtcCompileProgram fails with error " << hiprtcGetErrorString(result); + } + +HIPRTC General APIs +=============================================================================== + +HIPRTC provides ``hiprtcVersion(int* major, int* minor)`` for querying the +version. This sets the output parameters major and minor with the HIP Runtime +compilation major version and minor version number respectively. + +Currently, it returns hardcoded values. This should be implemented to return HIP +runtime major and minor version in the future releases. + +Lowered Names (Mangled Names) +=============================================================================== + +HIPRTC mangles the ``__global__`` function names and names of ``__device__`` and +``__constant__`` variables. 
If the generated binary is being loaded using the +HIP Runtime API, the kernel function or ``__device__/__constant__`` variable +must be looked up by name, but this is very hard when the name has been mangled. +To overcome this, HIPRTC provides API functions that map ``__global__`` function +or ``__device__/__constant__`` variable names in the source to the mangled names +present in the generated binary. + +The two APIs :cpp:func:`hiprtcAddNameExpression` and +:cpp:func:`hiprtcGetLoweredName` provide this functionality. First, a 'name +expression' string denoting the address for the ``__global__`` function or +``__device__/__constant__`` variable is provided to +:cpp:func:`hiprtcAddNameExpression`. Then, the program is compiled with +:cpp:func:`hiprtcCreateProgram`. During compilation, HIPRTC will parse the name +expression string as a C++ constant expression at the end of the user program. +Finally, the function :cpp:func:`hiprtcGetLoweredName` is called with the +original name expression and it returns a pointer to the lowered name. The +lowered name can be used to refer to the kernel or variable in the HIP Runtime +API. + +.. note:: + + * The identical name expression string must be provided on a subsequent call + to :cpp:func:`hiprtcGetLoweredName` to extract the lowered name. + + * The correct sequence of calls is : :cpp:func:`hiprtcAddNameExpression`, + :cpp:func:`hiprtcCreateProgram`, :cpp:func:`hiprtcGetLoweredName`, + :cpp:func:`hiprtcDestroyProgram`. + + * The lowered names must be fetched using :cpp:func:`hiprtcGetLoweredName` + only after the HIPRTC program has been compiled, and before it has been + destroyed. + +Example +------------------------------------------------------------------------------- + +Kernel containing various definitions ``__global__`` functions/function +templates and ``__device__/__constant__`` variables can be stored in a string. + +.. code-block:: cpp + + static constexpr const char gpu_program[] { + R"( + __device__ int V1; // set from host code + static __global__ void f1(int *result) { *result = V1 + 10; } + namespace N1 { + namespace N2 { + __constant__ int V2; // set from host code + __global__ void f2(int *result) { *result = V2 + 20; } + } + } + template + __global__ void f3(int *result) { *result = sizeof(T); } + )"}; + +:cpp:func:`hiprtcAddNameExpression` is called with various name expressions +referring to the address of ``__global__`` functions and +``__device__/__constant__`` variables. + +.. code-block:: cpp + + kernel_name_vec.push_back("&f1"); + kernel_name_vec.push_back("N1::N2::f2"); + kernel_name_vec.push_back("f3"); + for (auto&& x : kernel_name_vec) hiprtcAddNameExpression(prog, x.c_str()); + variable_name_vec.push_back("&V1"); + variable_name_vec.push_back("&N1::N2::V2"); + for (auto&& x : variable_name_vec) hiprtcAddNameExpression(prog, x.c_str()); + +After which, the program is compiled using :cpp:func:`hiprtcCompileProgram`, the +generated binary is loaded using :cpp:func:`hipModuleLoadData`, and the mangled +names can be fetched using :cpp:func:`hirtcGetLoweredName`. + +.. code-block:: cpp + + for (decltype(variable_name_vec.size()) i = 0; i != variable_name_vec.size(); ++i) { + const char* name; + hiprtcGetLoweredName(prog, variable_name_vec[i].c_str(), &name); + } + +.. 
code-block:: cpp + + for (decltype(kernel_name_vec.size()) i = 0; i != kernel_name_vec.size(); ++i) { + const char* name; + hiprtcGetLoweredName(prog, kernel_name_vec[i].c_str(), &name); + } + +The mangled name of the variables are used to look up the variable in the module +and update its value. + +.. code-block:: cpp + + hipDeviceptr_t variable_addr; + size_t bytes{}; + hipModuleGetGlobal(&variable_addr, &bytes, module, name); + hipMemcpyHtoD(variable_addr, &initial_value, sizeof(initial_value)); + + +Finally, the mangled name of the kernel is used to launch it using the +``hipModule`` APIs. + +.. code-block:: cpp + + hipFunction_t kernel; + hipModuleGetFunction(&kernel, module, name); + hipModuleLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, nullptr, nullptr, config); + +Versioning +=============================================================================== + +HIPRTC uses the following versioning: + +* Linux + + * HIPRTC follows the same versioning as HIP runtime library. + * The ``so`` name field for the shared library is set to MAJOR version. For + example, for HIP 5.3 the ``so`` name is set to 5 (``hiprtc.so.5``). + +* Windows + + * HIPRTC dll is named as ``hiprtcXXYY.dll`` where ``XX`` is MAJOR version and + ``YY`` is MINOR version. For example, for HIP 5.3 the name is + ``hiprtc0503.dll``. + +HIP header support +=============================================================================== + +Added HIPRTC support for all the hip common header files such as +``library_types.h``, ``hip_math_constants.h``, ``hip_complex.h``, +``math_functions.h``, ``surface_types.h`` etc. from 6.1. HIPRTC users need not +include any HIP macros or constants explicitly in their header files. All of +these should get included via HIPRTC builtins when the app links to HIPRTC +library. + +Deprecation notice +=============================================================================== + +* Currently HIPRTC APIs are separated from HIP APIs and HIPRTC is available as a + separate library ``libhiprtc.so``/ ``libhiprtc.dll``. But on Linux, HIPRTC + symbols are also present in ``libamdhip64.so`` in order to support the + existing applications. Gradually, these symbols will be removed from HIP + library and applications using HIPRTC will be required to explicitly link to + HIPRTC library. However, on Windows ``hiprtc.dll`` must be used as the + ``amdhip64.dll`` doesn't contain the HIPRTC symbols. + +* Data types such as ``uint32_t``, ``uint64_t``, ``int32_t``, ``int64_t`` + defined in std namespace in HIPRTC are deprecated earlier and are being + removed from ROCm release 6.1 since these can conflict with the standard + C++ data types. These data types are now prefixed with ``__hip__``, for example + ``__hip_uint32_t``. Applications previously using ``std::uint32_t`` or similar + types can use ``__hip_`` prefixed types to avoid conflicts with standard std + namespace or application can have their own definitions for these types. Also, + type_traits templates previously defined in std namespace are moved to + ``__hip_internal`` namespace as implementation details. 
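+
+For example, an application that previously relied on the HIPRTC symbols
+exported by ``libamdhip64.so`` might be linked against the HIPRTC library
+explicitly. A minimal sketch, assuming a single translation unit
+``rtc_app.cpp`` (the file name is illustrative):
+
+.. code-block:: bash
+
+   # link libhiprtc.so explicitly instead of relying on libamdhip64.so
+   hipcc rtc_app.cpp -lhiprtc -o rtc_app
+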
From 13b42d888b02dc06d166454ee44b45b4d31b6d1d Mon Sep 17 00:00:00 2001 From: Matthias Knorr Date: Mon, 13 Jan 2025 13:07:51 +0100 Subject: [PATCH 04/32] Docs: Refactor HIP porting guide --- .wordlist.txt | 7 +- docs/how-to/hip_cpp_language_extensions.rst | 2 +- docs/how-to/hip_porting_guide.md | 582 ------------------- docs/how-to/hip_porting_guide.rst | 604 ++++++++++++++++++++ docs/how-to/logging.rst | 13 + docs/understand/compilers.rst | 75 +++ 6 files changed, 695 insertions(+), 588 deletions(-) delete mode 100644 docs/how-to/hip_porting_guide.md create mode 100644 docs/how-to/hip_porting_guide.rst diff --git a/.wordlist.txt b/.wordlist.txt index 32d489abc8..de7c91b31a 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -24,7 +24,8 @@ coroutines Ctx cuBLASLt cuCtx -CUDA's +CUDA +cuda cuDNN cuModule dataflow @@ -35,7 +36,6 @@ Dereferencing dll DirectX EIGEN -EIGEN's enqueue enqueues entrypoint @@ -61,7 +61,6 @@ hardcoded HC hcBLAS HIP-Clang -HIP's hipcc hipCtx hipexamine @@ -142,7 +141,6 @@ quad representable RMW rocgdb -ROCm's rocTX roundtrip rst @@ -158,7 +156,6 @@ sinewave SOMA SPMV structs -struct's SYCL syntaxes texel diff --git a/docs/how-to/hip_cpp_language_extensions.rst b/docs/how-to/hip_cpp_language_extensions.rst index aa993541e4..1c55955b5d 100644 --- a/docs/how-to/hip_cpp_language_extensions.rst +++ b/docs/how-to/hip_cpp_language_extensions.rst @@ -469,7 +469,7 @@ compile-time constant on the host. It has to be queried using applications. NVIDIA devices return 32 for this variable; AMD devices return 64 for gfx9 and 32 for gfx10 and above. While code that assumes a ``warpSize`` of 32 can run on devices with a ``warpSize`` of 64, it only utilizes half of - the the compute resources. + the compute resources. ******************************************************************************** Vector types diff --git a/docs/how-to/hip_porting_guide.md b/docs/how-to/hip_porting_guide.md deleted file mode 100644 index a6027d4801..0000000000 --- a/docs/how-to/hip_porting_guide.md +++ /dev/null @@ -1,582 +0,0 @@ - - - - - - -# HIP porting guide - -In addition to providing a portable C++ programming environment for GPUs, HIP is designed to ease -the porting of existing CUDA code into the HIP environment. This section describes the available tools -and provides practical suggestions on how to port CUDA code and work through common issues. - -## Porting a New CUDA Project - -### General Tips - -* Starting the port on a CUDA machine is often the easiest approach, since you can incrementally port pieces of the code to HIP while leaving the rest in CUDA. (Recall that on CUDA machines HIP is just a thin layer over CUDA, so the two code types can interoperate on NVCC platforms.) Also, the HIP port can be compared with the original CUDA code for function and performance. -* Once the CUDA code is ported to HIP and is running on the CUDA machine, compile the HIP code using the HIP compiler on an AMD machine. -* HIP ports can replace CUDA versions: HIP can deliver the same performance as a native CUDA implementation, with the benefit of portability to both NVIDIA and AMD architectures as well as a path to future C++ standard support. You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure. -* Use **[hipconvertinplace-perl.sh](https://github.com/ROCm/HIPIFY/blob/amd-staging/bin/hipconvertinplace-perl.sh)** to hipify all code files in the CUDA source directory. 
- -### Scanning existing CUDA code to scope the porting effort - -The **[hipexamine-perl.sh](https://github.com/ROCm/HIPIFY/blob/amd-staging/bin/hipexamine-perl.sh)** tool will scan a source directory to determine which files contain CUDA code and how much of that code can be automatically hipified. - -```shell -> cd examples/rodinia_3.0/cuda/kmeans -> $HIP_DIR/bin/hipexamine-perl.sh. -info: hipify ./kmeans.h =====> -info: hipify ./unistd.h =====> -info: hipify ./kmeans.c =====> -info: hipify ./kmeans_cuda_kernel.cu =====> - info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:3 other:0 ) warn:0 LOC:185 -info: hipify ./getopt.h =====> -info: hipify ./kmeans_cuda.cu =====> - info: converted 49 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:0 math:0 stream:0 event:0 err:0 def:0 tex:12 other:0 ) warn:0 LOC:311 -info: hipify ./rmse.c =====> -info: hipify ./cluster.c =====> -info: hipify ./getopt.c =====> -info: hipify ./kmeans_clustering.c =====> -info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607 - kernels (1 total) : kmeansPoint(1) -``` - -hipexamine-perl scans each code file (cpp, c, h, hpp, etc.) found in the specified directory: - -* Files with no CUDA code (`kmeans.h`) print one line summary just listing the source file name. -* Files with CUDA code print a summary of what was found - for example the `kmeans_cuda_kernel.cu` file: - -```shell -info: hipify ./kmeans_cuda_kernel.cu =====> - info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 -``` - -* Interesting information in `kmeans_cuda_kernel.cu` : - * How many CUDA calls were converted to HIP (40) - * Breakdown of the CUDA functionality used (`dev:0 mem:0` etc). This file uses many CUDA builtins (37) and texture functions (3). - * Warning for code that looks like CUDA API but was not converted (0 in this file). - * Count Lines-of-Code (LOC) - 185 for this file. - -* hipexamine-perl also presents a summary at the end of the process for the statistics collected across all files. This has similar format to the per-file reporting, and also includes a list of all kernels which have been called. An example from above: - -```shell -info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607 - kernels (1 total) : kmeansPoint(1) -``` - -### Converting a project "in-place" - -```shell -> hipify-perl --inplace -``` - -For each input file FILE, this script will: - -* If `FILE.prehip` file does not exist, copy the original code to a new file with extension `.prehip`. Then hipify the code file. -* If `FILE.prehip` file exists, hipify `FILE.prehip` and save to FILE. - -This is useful for testing improvements to the hipify toolset. - -The [hipconvertinplace-perl.sh](https://github.com/ROCm/HIPIFY/blob/amd-staging/bin/hipconvertinplace-perl.sh) script will perform inplace conversion for all code files in the specified directory. -This can be quite handy when dealing with an existing CUDA code base since the script preserves the existing directory structure -and filenames - and includes work. After converting in-place, you can review the code to add additional parameters to -directory names. - -```shell -> hipconvertinplace-perl.sh MY_SRC_DIR -``` - -### Library Equivalents - -Most CUDA libraries have a corresponding ROCm library with similar functionality and APIs. 
However, ROCm also provides HIP marshalling libraries that greatly simplify the porting process because they more precisely reflect their CUDA counterparts and can be used with either the AMD or NVIDIA platforms (see "Identifying HIP Target Platform" below). There are a few notable exceptions: - -* MIOpen does not have a marshalling library interface to ease porting from cuDNN. -* RCCL is a drop-in replacement for NCCL and implements the NCCL APIs. -* hipBLASLt does not have a ROCm library but can still target the NVIDIA platform, as needed. -* EIGEN's HIP support is part of the library. - -| CUDA Library | HIP Library | ROCm Library | Comment | -|------------- | ----------- | ------------ | ------- | -| cuBLAS | hipBLAS | rocBLAS | Basic Linear Algebra Subroutines -| cuBLASLt | hipBLASLt | N/A | Basic Linear Algebra Subroutines, lightweight and new flexible API -| cuFFT | hipFFT | rocFFT | Fast Fourier Transfer Library -| cuSPARSE | hipSPARSE | rocSPARSE | Sparse BLAS + SPMV -| cuSOLVER | hipSOLVER | rocSOLVER | Lapack library -| AmgX | N/A | rocALUTION | Sparse iterative solvers and preconditioners with algebraic multigrid -| Thrust | N/A | rocThrust | C++ parallel algorithms library -| CUB | hipCUB | rocPRIM | Low Level Optimized Parallel Primitives -| cuDNN | N/A | MIOpen | Deep learning Solver Library -| cuRAND | hipRAND | rocRAND | Random Number Generator Library -| EIGEN | EIGEN | N/A | C++ template library for linear algebra: matrices, vectors, numerical solvers, -| NCCL | N/A | RCCL | Communications Primitives Library based on the MPI equivalents - -## Distinguishing Compiler Modes - -### Identifying HIP Target Platform - -All HIP projects target either AMD or NVIDIA platform. The platform affects which headers are included and which libraries are used for linking. - -* `__HIP_PLATFORM_AMD__` is defined if the HIP platform targets AMD. -Note, `__HIP_PLATFORM_HCC__` was previously defined if the HIP platform targeted AMD, it is deprecated. -* `__HIP_PLATFORM_NVDIA__` is defined if the HIP platform targets NVIDIA. -Note, `__HIP_PLATFORM_NVCC__` was previously defined if the HIP platform targeted NVIDIA, it is deprecated. - -### Identifying the Compiler: hip-clang or NVCC - -Often, it's useful to know whether the underlying compiler is HIP-Clang or NVCC. This knowledge can guard platform-specific code or aid in platform-specific performance tuning. - -```cpp -#ifdef __HIP_PLATFORM_AMD__ -// Compiled with HIP-Clang -#endif -``` - -```cpp -#ifdef __HIP_PLATFORM_NVIDIA__ -// Compiled with nvcc -// Could be compiling with CUDA language extensions enabled (for example, a ".cu file) -// Could be in pass-through mode to an underlying host compile OR (for example, a .cpp file) - -``` - -```cpp -#ifdef __CUDACC__ -// Compiled with nvcc (CUDA language extensions enabled) -``` - -Compiler directly generates the host code (using the Clang x86 target) and passes the code to another host compiler. Thus, they have no equivalent of the `__CUDACC__` define. - -### Identifying Current Compilation Pass: Host or Device - -NVCC makes two passes over the code: one for host code and one for device code. -HIP-Clang will have multiple passes over the code: one for the host code, and one for each architecture on the device code. -`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (HIP-Clang or NVCC) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace `#ifdef` checks on the `__CUDA_ARCH__` define. 
- -```cpp -// #ifdef __CUDA_ARCH__ -#if __HIP_DEVICE_COMPILE__ -``` - -Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesn't represent the feature capability of the target device. - -### Compiler Defines: Summary - -|Define | HIP-Clang | NVCC | Other (GCC, ICC, Clang, etc.) -|--- | --- | --- |--- | -|HIP-related defines:| -|`__HIP_PLATFORM_AMD__` | Defined | Undefined | Defined if targeting AMD platform; undefined otherwise | -|`__HIP_PLATFORM_NVIDIA__` | Undefined | Defined | Defined if targeting NVIDIA platform; undefined otherwise | -|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host | 1 if compiling for device; undefined if compiling for host | Undefined -|`__HIPCC__` | Defined | Defined | Undefined -|`__HIP_ARCH_*` | 0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0 -|NVCC-related defines:| -|`__CUDACC__` | Defined if source code is compiled by NVCC; undefined otherwise | Undefined -|`__NVCC__` Undefined | Defined | Undefined -|`__CUDA_ARCH__` | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code | Undefined -|hip-clang-related defines:| -|`__HIP__` | Defined | Undefined | Undefined -|HIP-Clang common defines: | -|`__clang__` | Defined | Defined | Undefined | Defined if using Clang; otherwise undefined - -## Identifying Architecture Features - -### HIP_ARCH Defines - -Some CUDA code tests `__CUDA_ARCH__` for a specific value to determine whether the machine supports a certain architectural feature. For instance, - -```cpp -#if (__CUDA_ARCH__ >= 130) -// doubles are supported -``` - -This type of code requires special attention, since AMD and CUDA devices have different architectural capabilities. Moreover, you can't determine the presence of a feature using a simple comparison against an architecture's version number. HIP provides a set of defines and device properties to query whether a specific architectural feature is supported. - -The `__HIP_ARCH_*` defines can replace comparisons of `__CUDA_ARCH__` values: - -```cpp -//#if (__CUDA_ARCH__ >= 130) // non-portable -if __HIP_ARCH_HAS_DOUBLES__ { // portable HIP feature query - // doubles are supported -} -``` - -For host code, the `__HIP_ARCH__*` defines are set to 0. You should only use the `__HIP_ARCH__` fields in device code. - -### Device-Architecture Properties - -Host code should query the architecture feature flags in the device properties that `hipGetDeviceProperties` returns, rather than testing the "major" and "minor" fields directly: - -```cpp -hipGetDeviceProperties(&deviceProp, device); -//if ((deviceProp.major == 1 && deviceProp.minor < 2)) // non-portable -if (deviceProp.arch.hasSharedInt32Atomics) { // portable HIP feature query - // has shared int32 atomic operations ... -} -``` - -### Table of Architecture Properties - -The table below shows the full set of architectural properties that HIP supports. 
- -|Define (use only in device code) | Device Property (run-time query) | Comment | -|------- | --------- | ----- | -|32-bit atomics: | | -|`__HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__` | `hasGlobalInt32Atomics` |32-bit integer atomics for global memory -|`__HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__` | `hasGlobalFloatAtomicExch` |32-bit float atomic exchange for global memory -|`__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__` | `hasSharedInt32Atomics` |32-bit integer atomics for shared memory -|`__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__` | `hasSharedFloatAtomicExch` |32-bit float atomic exchange for shared memory -|`__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__` | `hasFloatAtomicAdd` |32-bit float atomic add in global and shared memory -|64-bit atomics: | | -|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__` | `hasGlobalInt64Atomics` |64-bit integer atomics for global memory -|`__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__` | `hasSharedInt64Atomics` |64-bit integer atomics for shared memory -|Doubles: | | -|`__HIP_ARCH_HAS_DOUBLES__` | `hasDoubles` |Double-precision floating point -|Warp cross-lane operations: | | -|`__HIP_ARCH_HAS_WARP_VOTE__` | `hasWarpVote` |Warp vote instructions (`any`, `all`) -|`__HIP_ARCH_HAS_WARP_BALLOT__` | `hasWarpBallot` |Warp ballot instructions -|`__HIP_ARCH_HAS_WARP_SHUFFLE__` | `hasWarpShuffle` |Warp shuffle operations (`shfl_*`) -|`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__` | `hasFunnelShift` |Funnel shift two input words into one -|Sync: | | -|`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__` | `hasThreadFenceSystem` |`threadfence_system` -|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__` | `hasSyncThreadsExt` |`syncthreads_count`, `syncthreads_and`, `syncthreads_or` -|Miscellaneous: | | -|`__HIP_ARCH_HAS_SURFACE_FUNCS__` | `hasSurfaceFuncs` | -|`__HIP_ARCH_HAS_3DGRID__` | `has3dGrid` | Grids and groups are 3D -|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__` | `hasDynamicParallelism` | - -## Finding HIP - -Makefiles can use the following syntax to conditionally provide a default HIP_PATH if one does not exist: - -```shell -HIP_PATH ?= $(shell hipconfig --path) -``` - -## Identifying HIP Runtime - -HIP can depend on rocclr, or CUDA as runtime - -* AMD platform -On AMD platform, HIP uses ROCm Compute Language Runtime, called ROCclr. -ROCclr is a virtual device interface that HIP runtimes interact with different backends which allows runtimes to work on Linux , as well as Windows without much efforts. - -* NVIDIA platform -On NVIDIA platform, HIP is just a thin layer on top of CUDA. - -The environment variable `HIP_PLATFORM` specifies the runtime to use. The -platform is detected automatically by HIP. When an AMD graphics driver and an -AMD GPU is detected, `HIP_PLATFORM` is set to `amd`. If both runtimes are -installed, and a specific one should be used, or HIP can't detect the runtime, -setting the environment variable manually tells `hipcc` what compilation path to -choose. To use the CUDA compilation path, set the environment variable to -`HIP_PLATFORM=nvidia`. - -## `hipLaunchKernelGGL` - -`hipLaunchKernelGGL` is a macro that can serve as an alternative way to launch kernel, which accepts parameters of launch configurations (grid dims, group dims, stream, dynamic shared size) followed by a variable number of kernel arguments. -It can replace <<< >>>, if the user so desires. - -## Compiler Options - -hipcc is a portable compiler driver that will call NVCC or HIP-Clang (depending on the target system) and attach all required include and library options. It passes options through to the target compiler. 
Tools that call hipcc must ensure the compiler options are appropriate for the target compiler. -The `hipconfig` script may helpful in identifying the target platform, compiler and runtime. It can also help set options appropriately. - -### Compiler options supported on AMD platforms - -Here are the main compiler options supported on AMD platforms by HIP-Clang. - -| Option | Description | -| ------ | ----------- | -| `--amdgpu-target=` | [DEPRECATED] This option is being replaced by `--offload-arch=`. Generate code for the given GPU target. Supported targets are gfx701, gfx801, gfx802, gfx803, gfx900, gfx906, gfx908, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031. This option could appear multiple times on the same command line to generate a fat binary for multiple targets. | -| `--fgpu-rdc` | Generate relocatable device code, which allows kernels or device functions calling device functions in different translation units. | -| `-ggdb` | Equivalent to `-g` plus tuning for GDB. This is recommended when using ROCm's GDB to debug GPU code. | -| `--gpu-max-threads-per-block=` | Generate code to support up to the specified number of threads per block. | -| `-O` | Specify the optimization level. | -| `-offload-arch=` | Specify the AMD GPU [target ID](https://clang.llvm.org/docs/ClangOffloadBundler.html#target-id). | -| `-save-temps` | Save the compiler generated intermediate files. | -| `-v` | Show the compilation steps. | - -## Linking Issues - -### Linking With hipcc - -hipcc adds the necessary libraries for HIP as well as for the accelerator compiler (NVCC or AMD compiler). We recommend linking with hipcc since it automatically links the binary to the necessary HIP runtime libraries. It also has knowledge on how to link and to manage the GPU objects. - -### `-lm` Option - -hipcc adds `-lm` by default to the link command. - -## Linking Code With Other Compilers - -CUDA code often uses NVCC for accelerator code (defining and launching kernels, typically defined in `.cu` or `.cuh` files). -It also uses a standard compiler (g++) for the rest of the application. NVCC is a preprocessor that employs a standard host compiler (gcc) to generate the host code. -Code compiled using this tool can employ only the intersection of language features supported by both NVCC and the host compiler. -In some cases, you must take care to ensure the data types and alignment of the host compiler are identical to those of the device compiler. Only some host compilers are supported---for example, recent NVCC versions lack Clang host-compiler capability. - -HIP-Clang generates both device and host code using the same Clang-based compiler. The code uses the same API as gcc, which allows code generated by different gcc-compatible compilers to be linked together. For example, code compiled using HIP-Clang can link with code compiled using "standard" compilers (such as gcc, ICC and Clang). Take care to ensure all compilers use the same standard C++ header and library formats. - -### libc++ and libstdc++ - -hipcc links to libstdc++ by default. This provides better compatibility between g++ and HIP. - -If you pass `--stdlib=libc++` to hipcc, hipcc will use the libc++ library. Generally, libc++ provides a broader set of C++ features while libstdc++ is the standard for more compilers (notably including g++). - -When cross-linking C++ code, any C++ functions that use types from the C++ standard library (including std::string, std::vector and other containers) must use the same standard-library implementation. 
They include the following: - -* Functions or kernels defined in HIP-Clang that are called from a standard compiler -* Functions defined in a standard compiler that are called from HIP-Clang. - -Applications with these interfaces should use the default libstdc++ linking. - -Applications which are compiled entirely with hipcc, and which benefit from advanced C++ features not supported in libstdc++, and which do not require portability to NVCC, may choose to use libc++. - -### HIP Headers (`hip_runtime.h`, `hip_runtime_api.h`) - -The `hip_runtime.h` and `hip_runtime_api.h` files define the types, functions and enumerations needed to compile a HIP program: - -* `hip_runtime_api.h`: defines all the HIP runtime APIs (e.g., `hipMalloc`) and the types required to call them. A source file that is only calling HIP APIs but neither defines nor launches any kernels can include `hip_runtime_api.h`. `hip_runtime_api.h` uses no custom Heterogeneous Compute (HC) language features and can be compiled using a standard C++ compiler. -* `hip_runtime.h`: included in `hip_runtime_api.h`. It additionally provides the types and defines required to create and launch kernels. hip_runtime.h can be compiled using a standard C++ compiler but will expose a subset of the available functions. - -CUDA has slightly different contents for these two files. In some cases you may need to convert hipified code to include the richer `hip_runtime.h` instead of `hip_runtime_api.h`. - -### Using a Standard C++ Compiler - -You can compile `hip_runtime_api.h` using a standard C or C++ compiler (e.g., gcc or ICC). The HIP include paths and defines (`__HIP_PLATFORM_AMD__` or `__HIP_PLATFORM_NVIDIA__`) must pass to the standard compiler; `hipconfig` then returns the necessary options: - -```bash -> hipconfig --cxx_config - -D__HIP_PLATFORM_AMD__ -I/home/user1/hip/include -``` - -You can capture the `hipconfig` output and passed it to the standard compiler; below is a sample makefile syntax: - -```bash -CPPFLAGS += $(shell $(HIP_PATH)/bin/hipconfig --cpp_config) -``` - -NVCC includes some headers by default. However, HIP does not include default headers, and instead all required files must be explicitly included. -Specifically, files that call HIP run-time APIs or define HIP kernels must explicitly include the appropriate HIP headers. -If the compilation process reports that it cannot find necessary APIs (for example, `error: identifier hipSetDevice is undefined`), -ensure that the file includes hip_runtime.h (or hip_runtime_api.h, if appropriate). -The hipify-perl script automatically converts `cuda_runtime.h` to `hip_runtime.h`, and it converts `cuda_runtime_api.h` to `hip_runtime_api.h`, but it may miss nested headers or macros. - -#### `cuda.h` - -The HIP-Clang path provides an empty `cuda.h` file. Some existing CUDA programs include this file but don't require any of the functions. - -### Choosing HIP File Extensions - -Many existing CUDA projects use the `.cu` and `.cuh` file extensions to indicate code that should be run through the NVCC compiler. -For quick HIP ports, leaving these file extensions unchanged is often easier, as it minimizes the work required to change file names in the directory and #include statements in the files. - -For new projects or ports which can be re-factored, we recommend the use of the extension `.hip.cpp` for source files, and -`.hip.h` or `.hip.hpp` for header files. 
-This indicates that the code is standard C++ code, but also provides a unique indication for make tools to -run hipcc when appropriate. - -## Workarounds - -### ``warpSize`` - -Code should not assume a warp size of 32 or 64. See the -:ref:`HIP language extension for warpSize ` for information on how -to write portable wave-aware code. - -### Kernel launch with group size > 256 - -Kernel code should use `__attribute__((amdgpu_flat_work_group_size(,)))`. - -For example: - -```cpp -__global__ void dot(double *a,double *b,const int n) __attribute__((amdgpu_flat_work_group_size(1, 512))) -``` - -## `memcpyToSymbol` - -HIP support for `hipMemcpyToSymbol` is complete. This feature allows a kernel -to define a device-side data symbol which can be accessed on the host side. The symbol -can be in __constant or device space. - -Note that the symbol name needs to be encased in the HIP_SYMBOL macro, as shown in the code example below. This also applies to `hipMemcpyFromSymbol`, `hipGetSymbolAddress`, and `hipGetSymbolSize`. - -For example: - -Device Code: - -```cpp -#include -#include -#include - -#define HIP_ASSERT(status) \ - assert(status == hipSuccess) - -#define LEN 512 -#define SIZE 2048 - -__constant__ int Value[LEN]; - -__global__ void Get(int *Ad) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - Ad[tid] = Value[tid]; -} - -int main() -{ - int *A, *B, *Ad; - A = new int[LEN]; - B = new int[LEN]; - for(unsigned i=0;i(&ptr), sizeof(double)); -hipPointerAttribute_t attr; -hipPointerGetAttributes(&attr, ptr); /*attr.type will have value as hipMemoryTypeDevice*/ - -double* ptrHost; -hipHostMalloc(&ptrHost, sizeof(double)); -hipPointerAttribute_t attr; -hipPointerGetAttributes(&attr, ptrHost); /*attr.type will have value as hipMemoryTypeHost*/ -``` - -Please note, `hipMemoryType` enum values are different from `cudaMemoryType` enum values. - -For example, on AMD platform, `hipMemoryType` is defined in `hip_runtime_api.h`, - -```cpp -typedef enum hipMemoryType { - hipMemoryTypeHost = 0, ///< Memory is physically located on host - hipMemoryTypeDevice = 1, ///< Memory is physically located on device. (see deviceId for specific device) - hipMemoryTypeArray = 2, ///< Array memory, physically located on device. (see deviceId for specific device) - hipMemoryTypeUnified = 3, ///< Not used currently - hipMemoryTypeManaged = 4 ///< Managed memory, automaticallly managed by the unified memory system -} hipMemoryType; -``` - -Looking into CUDA toolkit, it defines `cudaMemoryType` as following, - -```cpp -enum cudaMemoryType -{ - cudaMemoryTypeUnregistered = 0, // Unregistered memory. - cudaMemoryTypeHost = 1, // Host memory. - cudaMemoryTypeDevice = 2, // Device memory. - cudaMemoryTypeManaged = 3, // Managed memory -} -``` - -In this case, memory type translation for `hipPointerGetAttributes` needs to be handled properly on NVIDIA platform to get the correct memory type in CUDA, which is done in the file `nvidia_hip_runtime_api.h`. - -So in any HIP applications which use HIP APIs involving memory types, developers should use `#ifdef` in order to assign the correct enum values depending on NVIDIA or AMD platform. - -As an example, please see the code from the [link](https://github.com/ROCm/hip-tests/tree/develop/catch/unit/memory/hipMemcpyParam2D.cc). - -With the `#ifdef` condition, HIP APIs work as expected on both AMD and NVIDIA platforms. - -Note, `cudaMemoryTypeUnregstered` is currently not supported in `hipMemoryType` enum, due to HIP functionality backward compatibility. 
- -## `threadfence_system` - -`threadfence_system` makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices. -Some implementations can provide this behavior by flushing the GPU L2 cache. -HIP/HIP-Clang does not provide this functionality. As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have a performance impact. - -### Textures and Cache Control - -Compute programs sometimes use textures either to access dedicated texture caches or to use the texture-sampling hardware for interpolation and clamping. The former approach uses simple point samplers with linear interpolation, essentially only reading a single point. The latter approach uses the sampler hardware to interpolate and combine multiple samples. AMD hardware, as well as recent competing hardware, has a unified texture/L1 cache, so it no longer has a dedicated texture cache. But the NVCC path often caches global loads in the L2 cache, and some programs may benefit from explicit control of the L1 cache contents. We recommend the `__ldg` instruction for this purpose. - -AMD compilers currently load all data into both the L1 and L2 caches, so `__ldg` is treated as a no-op. - -We recommend the following for functional portability: - -* For programs that use textures only to benefit from improved caching, use the `__ldg` instruction -* Programs that use texture object and reference APIs, work well on HIP - -## More Tips - -### HIP Logging - -On an AMD platform, set the AMD_LOG_LEVEL environment variable to log HIP application execution information. - -The value of the setting controls different logging level, - -```cpp -enum LogLevel { -LOG_NONE = 0, -LOG_ERROR = 1, -LOG_WARNING = 2, -LOG_INFO = 3, -LOG_DEBUG = 4 -}; -``` - -Logging mask is used to print types of functionalities during the execution of HIP application. -It can be set as one of the following values, - -```cpp -enum LogMask { - LOG_API = 1, //!< (0x1) API call - LOG_CMD = 2, //!< (0x2) Kernel and Copy Commands and Barriers - LOG_WAIT = 4, //!< (0x4) Synchronization and waiting for commands to finish - LOG_AQL = 8, //!< (0x8) Decode and display AQL packets - LOG_QUEUE = 16, //!< (0x10) Queue commands and queue contents - LOG_SIG = 32, //!< (0x20) Signal creation, allocation, pool - LOG_LOCK = 64, //!< (0x40) Locks and thread-safety code. - LOG_KERN = 128, //!< (0x80) Kernel creations and arguments, etc. - LOG_COPY = 256, //!< (0x100) Copy debug - LOG_COPY2 = 512, //!< (0x200) Detailed copy debug - LOG_RESOURCE = 1024, //!< (0x400) Resource allocation, performance-impacting events. - LOG_INIT = 2048, //!< (0x800) Initialization and shutdown - LOG_MISC = 4096, //!< (0x1000) Misc debug, not yet classified - LOG_AQL2 = 8192, //!< (0x2000) Show raw bytes of AQL packet - LOG_CODE = 16384, //!< (0x4000) Show code creation debug - LOG_CMD2 = 32768, //!< (0x8000) More detailed command info, including barrier commands - LOG_LOCATION = 65536, //!< (0x10000) Log message location - LOG_MEM = 131072, //!< (0x20000) Memory allocation - LOG_MEM_POOL = 262144, //!< (0x40000) Memory pool allocation, including memory in graphs - LOG_ALWAYS = -1 //!< (0xFFFFFFFF) Log always even mask flag is zero -}; -``` - -### Debugging hipcc - -To see the detailed commands that hipcc issues, set the environment variable HIPCC_VERBOSE to 1. 
Doing so will print to ``stderr`` the HIP-clang (or NVCC) commands that hipcc generates.
-
-```bash
-export HIPCC_VERBOSE=1
-make
-...
-hipcc-cmd: /opt/rocm/bin/hipcc --offload-arch=native -x hip backprop_cuda.cu
-```
-
-### Editor Highlighting
-
-See the utils/vim or utils/gedit directories to add handy highlighting to hip files.
diff --git a/docs/how-to/hip_porting_guide.rst b/docs/how-to/hip_porting_guide.rst
new file mode 100644
index 0000000000..af248b3ec9
--- /dev/null
+++ b/docs/how-to/hip_porting_guide.rst
@@ -0,0 +1,604 @@
+.. meta::
+   :description: This chapter presents how to port CUDA source code to HIP.
+   :keywords: AMD, ROCm, HIP, CUDA, porting, port
+
+################################################################################
+HIP porting guide
+################################################################################
+
+HIP is designed to ease the porting of existing CUDA code into the HIP
+environment. This page describes the available tools and provides practical
+suggestions on how to port CUDA code and work through common issues.
+
+********************************************************************************
+Porting a CUDA Project
+********************************************************************************
+
+General Tips
+================================================================================
+
+* You can incrementally port pieces of the code to HIP while leaving the rest in CUDA. HIP is just a thin layer over CUDA, so the two languages can interoperate.
+* Starting to port on an NVIDIA machine is often the easiest approach, as the code can be tested for functionality and performance even if not fully ported to HIP.
+* Once the CUDA code is ported to HIP and is running on the CUDA machine, compile the HIP code for an AMD machine.
+* You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure.
+* Use the `HIPIFY `_ tools to automatically convert CUDA code to HIP, as described in the following section.
+
+HIPIFY
+================================================================================
+
+:doc:`HIPIFY ` is a collection of tools that automatically
+translate CUDA to HIP code. There are two flavours available, ``hipify-clang``
+and ``hipify-perl``.
+
+:doc:`hipify-clang ` is, as the name implies, a Clang-based
+tool. It actually parses the code and translates it into an abstract syntax
+tree, from which it then generates the HIP source. For this, ``hipify-clang``
+needs to be able to actually compile the code, so the CUDA code needs to be
+correct, and a CUDA installation with all necessary headers must be provided.
+
+:doc:`hipify-perl ` uses pattern matching to translate the
+CUDA code to HIP. It does not require a working CUDA installation and can also
+convert CUDA code that is not syntactically correct. It is therefore easier to
+set up and use, but is not as powerful as ``hipify-clang``.
+
+Scanning existing CUDA code to scope the porting effort
+--------------------------------------------------------------------------------
+
+The ``--examine`` option, supported by both the Clang and Perl versions, tells
+hipify to do a test run without changing the files: it scans the sources to
+determine which files contain CUDA code and how much of that code can be
+hipified automatically.
+
+There are also the ``hipexamine-perl.sh`` and ``hipexamine.sh`` (for
+``hipify-clang``) scripts to automatically scan directories.
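+
+As a quick sketch, a single file can also be inspected without being modified
+by passing ``--examine`` directly; the file name below is hypothetical:
+
+.. code-block:: shell
+
+   hipify-perl --examine kernel.cu
+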
+ +For example, the following is a scan of one of the +`cuda-samples `_: + +.. code-block:: shell + + > cd Samples/2_Concepts_and_Techniques/convolutionSeparable/ + > hipexamine-perl.sh + [HIPIFY] info: file './convolutionSeparable.cu' statistics: + CONVERTED refs count: 2 + TOTAL lines of code: 214 + WARNINGS: 0 + [HIPIFY] info: CONVERTED refs by names: + cooperative_groups.h => hip/hip_cooperative_groups.h: 1 + cudaMemcpyToSymbol => hipMemcpyToSymbol: 1 + + [HIPIFY] info: file './main.cpp' statistics: + CONVERTED refs count: 13 + TOTAL lines of code: 174 + WARNINGS: 0 + [HIPIFY] info: CONVERTED refs by names: + cudaDeviceSynchronize => hipDeviceSynchronize: 2 + cudaFree => hipFree: 3 + cudaMalloc => hipMalloc: 3 + cudaMemcpy => hipMemcpy: 2 + cudaMemcpyDeviceToHost => hipMemcpyDeviceToHost: 1 + cudaMemcpyHostToDevice => hipMemcpyHostToDevice: 1 + cuda_runtime.h => hip/hip_runtime.h: 1 + + [HIPIFY] info: file 'GLOBAL' statistics: + CONVERTED refs count: 15 + TOTAL lines of code: 512 + WARNINGS: 0 + [HIPIFY] info: CONVERTED refs by names: + cooperative_groups.h => hip/hip_cooperative_groups.h: 1 + cudaDeviceSynchronize => hipDeviceSynchronize: 2 + cudaFree => hipFree: 3 + cudaMalloc => hipMalloc: 3 + cudaMemcpy => hipMemcpy: 2 + cudaMemcpyDeviceToHost => hipMemcpyDeviceToHost: 1 + cudaMemcpyHostToDevice => hipMemcpyHostToDevice: 1 + cudaMemcpyToSymbol => hipMemcpyToSymbol: 1 + cuda_runtime.h => hip/hip_runtime.h: 1 + +``hipexamine-perl.sh`` reports how many CUDA calls are going to be converted to +HIP (e.g. ``CONVERTED refs count: 2``), and lists them by name together with +their corresponding HIP-version (see the lines following ``[HIPIFY] info: +CONVERTED refs by names:``). It also lists the total lines of code for the file +and potential warnings. In the end it prints a summary for all files. + +Automatically converting a CUDA project +-------------------------------------------------------------------------------- + +To directly replace the files, the ``--inplace`` option of ``hipify-perl`` or +``hipify-clang`` can be used. This creates a backup of the original files in a +``.prehip`` file and overwrites the existing files, keeping their file +endings. If the ``--inplace`` option is not given, the scripts print the +hipified code to ``stdout``. + +``hipconvertinplace.sh``or ``hipconvertinplace-perl.sh`` operate on whole +directories. + +Library Equivalents +================================================================================ + +ROCm provides libraries to ease porting of code relying on CUDA libraries. +Most CUDA libraries have a corresponding HIP library. + +There are two flavours of libraries provided by ROCm, ones prefixed with ``hip`` +and ones prefixed with ``roc``. While both are written using HIP, in general +only the ``hip``-libraries are portable. The libraries with the ``roc``-prefix +might also run on CUDA-capable GPUs, however they have been optimized for AMD +GPUs and might use assembly code or a different API, to achieve the best +performance. + +.. note:: + + If the application is only required to run on AMD GPUs, it is recommended to + use the ``roc``-libraries. + +In the case where a library provides a ``roc``- and a ``hip``- version, the +``hip`` version is a marshalling library, which is just a thin layer that is +redirecting the function calls to either the ``roc``-library or the +corresponding CUDA library, depending on the platform, to provide compatibility. + +.. 
list-table:: + :header-rows: 1 + + * + - CUDA Library + - ``hip`` Library + - ``roc`` Library + - Comment + * + - cuBLAS + - `hipBLAS `_ + - `rocBLAS `_ + - Basic Linear Algebra Subroutines + * + - cuBLASLt + - `hipBLASLt `_ + - + - Linear Algebra Subroutines, lightweight and new flexible API + * + - cuFFT + - `hipFFT `_ + - `rocFFT `_ + - Fast Fourier Transfer Library + * + - cuSPARSE + - `hipSPARSE `_ + - `rocSPARSE `_ + - Sparse BLAS + SPMV + * + - cuSOLVER + - `hipSOLVER `_ + - `rocSOLVER `_ + - Lapack library + * + - AmgX + - + - `rocALUTION `_ + - Sparse iterative solvers and preconditioners with algebraic multigrid + * + - Thrust + - + - `rocThrust `_ + - C++ parallel algorithms library + * + - CUB + - `hipCUB `_ + - `rocPRIM `_ + - Low Level Optimized Parallel Primitives + * + - cuDNN + - + - `MIOpen `_ + - Deep learning Solver Library + * + - cuRAND + - `hipRAND `_ + - `rocRAND `_ + - Random Number Generator Library + * + - NCCL + - + - `RCCL `_ + - Communications Primitives Library based on the MPI equivalents + RCCL is a drop-in replacement for NCCL + +******************************************************************************** +Distinguishing compilers and platforms +******************************************************************************** + +Identifying the HIP Target Platform +================================================================================ + +HIP projects can target either the AMD or NVIDIA platform. The platform affects +which backend-headers are included and which libraries are used for linking. The +created binaries are not portable between AMD and NVIDIA platforms. + +To write code that is specific to a platform the C++-macros specified in the +following section can be used. + +Compiler Defines: Summary +-------------------------------------------------------------------------------- + +This section lists macros that are defined by compilers and the HIP/CUDA APIs, +and what compiler/platform combinations they are defined for. + +The following table lists the macros that can be used when compiling HIP. Most +of these macros are not directly defined by the compilers, but in +``hip_common.h``, which is included by ``hip_runtime.h``. + +.. list-table:: HIP-related defines + :header-rows: 1 + + * + - Macro + - ``amdclang++`` + - ``nvcc`` when used as backend for ``hipcc`` + - Other (GCC, ICC, Clang, etc.) + * + - ``__HIP_PLATFORM_AMD__`` + - Defined + - Undefined + - Undefined, needs to be set explicitly + * + - ``__HIP_PLATFORM_NVIDIA__`` + - Undefined + - Defined + - Undefined, needs to be set explicitly + * + - ``__HIPCC__`` + - Defined when compiling ``.hip`` files or specifying ``-x hip`` + - Defined when compiling ``.hip`` files or specifying ``-x hip`` + - Undefined + * + - ``__HIP_DEVICE_COMPILE__`` + - 1 if compiling for device + undefined if compiling for host + - 1 if compiling for device + undefined if compiling for host + - Undefined + * + - ``__HIP_ARCH___`` + - 0 or 1 depending on feature support of targeted hardware (see :ref:`identifying_device_architecture_features`) + - 0 or 1 depending on feature support of targeted hardware + - 0 + * + - ``__HIP__`` + - Defined when compiling ``.hip`` files or specifying ``-x hip`` + - Undefined + - Undefined + +The following table lists macros related to ``nvcc`` and CUDA as HIP backend. + +.. list-table:: NVCC-related defines + :header-rows: 1 + + * + - Macro + - ``amdclang++`` + - ``nvcc`` when used as backend for ``hipcc`` + - Other (GCC, ICC, Clang, etc.) 
+ * + - ``__CUDACC__`` + - Undefined + - Defined + - Undefined + (Clang defines this when explicitly compiling CUDA code) + * + - ``__NVCC__`` + - Undefined + - Defined + - Undefined + * + - ``__CUDA_ARCH__`` [#cuda_arch]_ + - Undefined + - Defined in device code + Integer representing compute capability + Must not be used in host code + - Undefined + +.. [#cuda_arch] the use of ``__CUDA_ARCH__`` to check for hardware features is + discouraged, as this is not portable. Use the ``__HIP_ARCH_HAS_`` + macros instead. + +Identifying the compilation target platform +-------------------------------------------------------------------------------- + +Despite HIP's portability, it can be necessary to tailor code to a specific +platform, in order to provide platform-specific code, or aid in +platform-specific performance improvements. + +For this, the ``__HIP_PLATFORM_AMD__`` and ``__HIP_PLATFORM_NVIDIA__`` macros +can be used, e.g.: + +.. code-block:: cpp + + #ifdef __HIP_PLATFORM_AMD__ + // This code path is compiled when amdclang++ is used for compilation + #endif + +.. code-block:: cpp + + #ifdef __HIP_PLATFORM_NVIDIA__ + // This code path is compiled when nvcc is used for compilation + // Could be compiling with CUDA language extensions enabled (for example, a ".cu file) + // Could be in pass-through mode to an underlying host compiler (for example, a .cpp file) + #endif + +When using ``hipcc``, the environment variable ``HIP_PLATFORM`` specifies the +runtime to use. When an AMD graphics driver and an AMD GPU is detected, +``HIP_PLATFORM`` is set to ``amd``. If both runtimes are installed, and a +specific one should be used, or ``hipcc`` can't detect the runtime, the +environment variable has to be set manually. + +To explicitly use the CUDA compilation path, use: + +.. code-block:: bash + + export HIP_PLATFORM=nvidia + hipcc main.cpp + +Identifying Host or Device Compilation Pass +-------------------------------------------------------------------------------- + +``amdclang++`` makes multiple passes over the code: one for the host code, and +one each for the device code for every GPU architecture to be compiled for. +``nvcc`` makes two passes over the code: one for host code and one for device +code. + +The ``__HIP_DEVICE_COMPILE__``-macro is defined when the compiler is compiling +for the device. + + +``__HIP_DEVICE_COMPILE__`` is a portable check that can replace the +``__CUDA_ARCH__``. + +.. code-block:: cpp + + #include "hip/hip_runtime.h" + #include + + __host__ __device__ void call_func(){ + #ifdef __HIP_DEVICE_COMPILE__ + printf("device\n"); + #else + std::cout << "host" << std::endl; + #endif + } + + __global__ void test_kernel(){ + call_func(); + } + + int main(int argc, char** argv) { + test_kernel<<<1, 1, 0, 0>>>(); + + call_func(); + } + +.. _identifying_device_architecture_features: + +******************************************************************************** +Identifying Device Architecture Features +******************************************************************************** + +GPUs of different generations and architectures do not all provide the same +level of :doc:`hardware feature support <../reference/hardware_features>`. To +guard device-code using these architecture dependent features, the +``__HIP_ARCH___`` C++-macros can be used. 
+
+Device Code Feature Identification
+================================================================================
+
+Some CUDA code tests ``__CUDA_ARCH__`` for a specific value to determine whether
+the GPU supports a certain architectural feature, depending on its compute
+capability. This requires knowing which ``__CUDA_ARCH__`` values support which
+feature set.
+
+HIP simplifies this by replacing these checks with feature-specific macros
+rather than architecture-specific ones.
+
+For instance,
+
+.. code-block:: cpp
+
+   //#if __CUDA_ARCH__ >= 130 // does not specify which feature is required, not portable
+   #if __HIP_ARCH_HAS_DOUBLES__ == 1 // explicitly specifies the required feature, portable between AMD and NVIDIA GPUs
+   // device code
+   #endif
+
+For host code, the ``__HIP_ARCH_<FEATURE>__`` defines are set to 0 if
+``hip_runtime.h`` is included, and are undefined otherwise. They should not be
+relied upon in host code.
+
+Host Code Feature Identification
+================================================================================
+
+Host code must not rely on the ``__HIP_ARCH_<FEATURE>__`` macros, as the GPUs
+available to a system cannot be known at compile time, and their architectural
+features differ.
+
+Host code can query architecture feature flags at runtime by using
+:cpp:func:`hipGetDeviceProperties` or :cpp:func:`hipDeviceGetAttribute`.
+
+.. code-block:: cpp
+
+   #include <hip/hip_runtime.h>
+   #include <iostream>
+   #include <cstdlib>
+
+   #define HIP_CHECK(expression) { \
+       const hipError_t err = expression; \
+       if (err != hipSuccess){ \
+           std::cout << "HIP Error: " << hipGetErrorString(err) \
+               << " at line " << __LINE__ << std::endl; \
+           std::exit(EXIT_FAILURE); \
+       } \
+   }
+
+   int main(){
+       int deviceCount;
+       HIP_CHECK(hipGetDeviceCount(&deviceCount));
+
+       int device = 0; // Query first available GPU. Can be replaced with any
+                       // integer up to, not including, deviceCount
+       hipDeviceProp_t deviceProp;
+       HIP_CHECK(hipGetDeviceProperties(&deviceProp, device));
+
+       std::cout << "The queried device ";
+       if (deviceProp.arch.hasSharedInt32Atomics) // portable HIP feature query
+           std::cout << "supports";
+       else
+           std::cout << "does not support";
+       std::cout << " shared int32 atomic operations" << std::endl;
+   }
+
+Table of Architecture Properties
+================================================================================
+
+The table below shows the full set of architectural properties that HIP
+supports, together with the corresponding macros and device properties.
+
+.. 
list-table:: + :header-rows: 1 + + * + - Macro (for device code) + - Device Property (host runtime query) + - Comment + * + - ``__HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__`` + - ``hasGlobalInt32Atomics`` + - 32-bit integer atomics for global memory + * + - ``__HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__`` + - ``hasGlobalFloatAtomicExch`` + - 32-bit float atomic exchange for global memory + * + - ``__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__`` + - ``hasSharedInt32Atomics`` + - 32-bit integer atomics for shared memory + * + - ``__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__`` + - ``hasSharedFloatAtomicExch`` + - 32-bit float atomic exchange for shared memory + * + - ``__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__`` + - ``hasFloatAtomicAdd`` + - 32-bit float atomic add in global and shared memory + * + - ``__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__`` + - ``hasGlobalInt64Atomics`` + - 64-bit integer atomics for global memory + * + - ``__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__`` + - ``hasSharedInt64Atomics`` + - 64-bit integer atomics for shared memory + * + - ``__HIP_ARCH_HAS_DOUBLES__`` + - ``hasDoubles`` + - Double-precision floating-point operations + * + - ``__HIP_ARCH_HAS_WARP_VOTE__`` + - ``hasWarpVote`` + - Warp vote instructions (``any``, ``all``) + * + - ``__HIP_ARCH_HAS_WARP_BALLOT__`` + - ``hasWarpBallot`` + - Warp ballot instructions + * + - ``__HIP_ARCH_HAS_WARP_SHUFFLE__`` + - ``hasWarpShuffle`` + - Warp shuffle operations (``shfl_*``) + * + - ``__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__`` + - ``hasFunnelShift`` + - Funnel shift two input words into one + * + - ``__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__`` + - ``hasThreadFenceSystem`` + - :cpp:func:`threadfence_system` + * + - ``__HIP_ARCH_HAS_SYNC_THREAD_EXT__`` + - ``hasSyncThreadsExt`` + - :cpp:func:`syncthreads_count`, :cpp:func:`syncthreads_and`, :cpp:func:`syncthreads_or` + * + - ``__HIP_ARCH_HAS_SURFACE_FUNCS__`` + - ``hasSurfaceFuncs`` + - Supports :ref:`surface functions `. + * + - ``__HIP_ARCH_HAS_3DGRID__`` + - ``has3dGrid`` + - Grids and groups are 3D + * + - ``__HIP_ARCH_HAS_DYNAMIC_PARALLEL__`` + - ``hasDynamicParallelism`` + - Ability to launch a kernel from within a kernel + +******************************************************************************** +Finding HIP +******************************************************************************** + +Makefiles can use the following syntax to conditionally provide a default HIP_PATH if one does not exist: + +.. code-block:: shell + + HIP_PATH ?= $(shell hipconfig --path) + +******************************************************************************** +Compilation +******************************************************************************** + +``hipcc`` is a portable compiler driver that calls ``nvcc`` or ``amdclang++`` +and forwards the appropriate options. It passes options through +to the target compiler. Tools that call ``hipcc`` must ensure the compiler +options are appropriate for the target compiler. + +``hipconfig`` is a helpful tool in identifying the current systems platform, +compiler and runtime. It can also help set options appropriately. + +HIP Headers +================================================================================ + +The ``hip_runtime.h`` headers define all the necessary types, functions, macros, +etc., needed to compile a HIP program, this includes host as well as device +code. ``hip_runtime_api.h`` is a subset of ``hip_runtime.h``. + +CUDA has slightly different contents for these two files. 
In some cases you may +need to convert hipified code to include the richer ``hip_runtime.h`` instead of +``hip_runtime_api.h``. + +Using a Standard C++ Compiler +================================================================================ + +You can compile ``hip_runtime_api.h`` using a standard C or C++ compiler +(e.g., ``gcc`` or ``icc``). +A source file that is only calling HIP APIs but neither defines nor launches any +kernels can be compiled with a standard host compiler (e.g. ``gcc`` or ``icc``) +even when ``hip_runtime_api.h`` or ``hip_runtime.h`` are included. + +The HIP include paths and platform macros (``__HIP_PLATFORM_AMD__`` or +``__HIP_PLATFORM_NVIDIA__``) must be passed to the compiler. + +``hipconfig`` can help in finding the necessary options, for example on an AMD +platform: + +.. code-block:: bash + + hipconfig --cpp_config + -D__HIP_PLATFORM_AMD__= -I/opt/rocm/include + +``nvcc`` includes some headers by default. ``hipcc`` does not include +default headers, and instead all required files must be explicitly included. + +The ``hipify`` tool automatically converts ``cuda_runtime.h`` to +``hip_runtime.h``, and it converts ``cuda_runtime_api.h`` to +``hip_runtime_api.h``, but it may miss nested headers or macros. + +******************************************************************************** +warpSize +******************************************************************************** + +Code should not assume a warp size of 32 or 64, as that is not portable between +platforms and architectures. The ``warpSize`` built-in should be used in device +code, while the host can query it during runtime via the device properties. See +the :ref:`HIP language extension for warpSize ` for information on +how to write portable wave-aware code. diff --git a/docs/how-to/logging.rst b/docs/how-to/logging.rst index ecf40fa192..3c8b8c5a53 100644 --- a/docs/how-to/logging.rst +++ b/docs/how-to/logging.rst @@ -240,3 +240,16 @@ information when calling the backend runtime. :3:C:\constructicon\builds\gfx\two\22.40\drivers\compute\hipamd\src\hip_memory.cpp:681 : 605414524092 us: 29864: [tid:0x9298] hipMemGetInfo: Returned hipSuccess : memInfo.total: 12.06 GB memInfo.free: 11.93 GB (99%) + +Logging hipcc commands +================================================================================ + +To see the detailed commands that hipcc issues, set the environment variable +``HIPCC_VERBOSE``. Doing so will print the HIP-clang (or NVCC) commands that +hipcc generates to ``stderr``. + +.. code-block:: shell + + export HIPCC_VERBOSE=1 + hipcc main.cpp + hipcc-cmd: /opt/rocm/lib/llvm/bin/clang++ --offload-arch=gfx90a --driver-mode=g++ -O3 --hip-link -x hip main.cpp diff --git a/docs/understand/compilers.rst b/docs/understand/compilers.rst index 53512e76e5..ccd2dbbec6 100644 --- a/docs/understand/compilers.rst +++ b/docs/understand/compilers.rst @@ -21,6 +21,81 @@ On NVIDIA CUDA platform, ``hipcc`` takes care of invoking compiler ``nvcc``. ``amdclang++`` is based on the ``clang++`` compiler. For more details, see the :doc:`llvm project`. +HIPCC +================================================================================ + +Common Compiler Options +-------------------------------------------------------------------------------- + +The following table shows the most common compiler options supported by +``hipcc``. + +.. 
list-table:: + :header-rows: 1 + + * + - Option + - Description + * + - ``--fgpu-rdc`` + - Generate relocatable device code, which allows kernels or device functions + to call device functions in different translation units. + * + - ``-ggdb`` + - Equivalent to `-g` plus tuning for GDB. This is recommended when using + ROCm's GDB to debug GPU code. + * + - ``--gpu-max-threads-per-block=`` + - Generate code to support up to the specified number of threads per block. + * + - ``-offload-arch=`` + - Generate code for the given GPU target. + For a full list of supported compilation targets see the `processor names in AMDGPU's llvm documentation `_. + This option can appear multiple times to generate a fat binary for multiple + targets. + The actual support of the platform's runtime may differ. + * + - ``-save-temps`` + - Save the compiler generated intermediate files. + * + - ``-v`` + - Show the compilation steps. + +Linking +-------------------------------------------------------------------------------- + +``hipcc`` adds the necessary libraries for HIP as well as for the accelerator +compiler (``nvcc`` or ``amdclang++``). We recommend linking with ``hipcc`` since +it automatically links the binary to the necessary HIP runtime libraries. + +Linking Code With Other Compilers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``nvcc`` by default uses ``g++`` to generate the host code. + +``amdclang++`` generates both device and host code. The code uses the same API +as ``gcc``, which allows code generated by different ``gcc``-compatible +compilers to be linked together. For example, code compiled using ``amdclang++`` +can link with code compiled using compilers such as ``gcc``, ``icc`` and +``clang``. Take care to ensure all compilers use the same standard C++ header +and library formats. + +libc++ and libstdc++ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``hipcc`` links to ``libstdc++`` by default. This provides better compatibility +between ``g++`` and HIP. + +In order to link to ``libc++``, pass ``--stdlib=libc++`` to ``hipcc``. +Generally, libc++ provides a broader set of C++ features while ``libstdc++`` is +the standard for more compilers, notably including ``g++``. + +When cross-linking C++ code, any C++ functions that use types from the C++ +standard library, such as ``std::string``, ``std::vector`` and other containers, +must use the same standard-library implementation. This includes cross-linking +between ``amdclang++`` and other compilers. + + HIP compilation workflow ================================================================================ From 780f7987ffff13ec20ffe48db970b7f772741051 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Thu, 13 Feb 2025 17:43:29 +0100 Subject: [PATCH 05/32] Docs: Expand HIP porting guide and CUDA driver porting guide --- docs/how-to/hip_cpp_language_extensions.rst | 37 ----- docs/how-to/hip_porting_driver_api.rst | 155 ++++++++++++++++---- docs/how-to/hip_porting_guide.rst | 68 +++++++-- 3 files changed, 182 insertions(+), 78 deletions(-) diff --git a/docs/how-to/hip_cpp_language_extensions.rst b/docs/how-to/hip_cpp_language_extensions.rst index 1c55955b5d..4798b1d9c1 100644 --- a/docs/how-to/hip_cpp_language_extensions.rst +++ b/docs/how-to/hip_cpp_language_extensions.rst @@ -250,43 +250,6 @@ Units, also known as SIMDs, each with their own register file. For more information see :doc:`../understand/hardware_implementation`. 
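For illustration, a kernel that caps its block size and asks for a minimum warp occupancy per
execution unit could be annotated as in the following sketch (the kernel, its body, and the
chosen bound values are example assumptions, not taken from the original text):

.. code-block:: cpp

   #include <hip/hip_runtime.h>

   // Example bounds: at most 256 threads per block, and at least 2 warps
   // resident per execution unit. Tune both values for the real kernel.
   __global__ void __launch_bounds__(256, 2)
   scale_kernel(float* data, float factor, size_t n) {
       const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       if (idx < n) {
           data[idx] *= factor; // launching with more than 256 threads per block fails
       }
   }

Launching this kernel with a block size above the stated maximum causes the launch to fail,
so the bound should match the block sizes the host code actually uses.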
:cpp:struct:`hipDeviceProp_t` also has a field ``executionUnitsPerMultiprocessor``.
 
-Porting from CUDA __launch_bounds__
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-CUDA also defines a ``__launch_bounds__`` qualifier which works similar to HIP's
-implementation, however it uses different parameters:
-
-.. code-block:: cpp
-
-  __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR)
-
-The first parameter is the same as HIP's implementation, but
-``MIN_BLOCKS_PER_MULTIPROCESSOR`` must be converted to
-``MIN_WARPS_PER_EXECUTION``, which uses warps and execution units rather than
-blocks and multiprocessors. This conversion is performed automatically by
-:doc:`HIPIFY `, or can be done manually with the following
-equation.
-
-.. code-block:: cpp
-
-  MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK) / warpSize
-
-Directly controlling the warps per execution unit makes it easier to reason
-about the occupancy, unlike with blocks, where the occupancy depends on the
-block size.
-
-The use of execution units rather than multiprocessors also provides support for
-architectures with multiple execution units per multiprocessor. For example, the
-AMD GCN architecture has 4 execution units per multiprocessor.
-
-maxregcount
-""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-
-Unlike ``nvcc``, ``amdclang++`` does not support the ``--maxregcount`` option.
-Instead, users are encouraged to use the ``__launch_bounds__`` directive since
-the parameters are more intuitive and portable than micro-architecture details
-like registers. The directive allows per-kernel control.
-
 Memory space qualifiers
 ================================================================================
 
diff --git a/docs/how-to/hip_porting_driver_api.rst b/docs/how-to/hip_porting_driver_api.rst
index d4d9da1673..41a7aff497 100644
--- a/docs/how-to/hip_porting_driver_api.rst
+++ b/docs/how-to/hip_porting_driver_api.rst
@@ -1,6 +1,6 @@
 .. meta::
   :description: This chapter presents how to port the CUDA driver API and showcases equivalent operations in HIP.
-  :keywords: AMD, ROCm, HIP, CUDA, driver API
+  :keywords: AMD, ROCm, HIP, CUDA, driver API, porting, port
 
 .. _porting_driver_api:
 
@@ -8,26 +8,25 @@
 Porting CUDA driver API
 *******************************************************************************
 
-NVIDIA provides separate CUDA driver and runtime APIs. The two APIs have
-significant overlap in functionality:
-
-* Both APIs support events, streams, memory management, memory copy, and error
-  handling.
-
-* Both APIs deliver similar performance.
+CUDA provides separate driver and runtime APIs. The two APIs generally provide
+similar functionality and can mostly be used interchangeably; however, the
+driver API allows more fine-grained control over kernel-level initialization,
+contexts, and module management, all of which the runtime API handles
+implicitly.
 
 * Driver API calls begin with the prefix ``cu``, while runtime API calls begin
   with the prefix ``cuda``. For example, the driver API contains
   ``cuEventCreate``, while the runtime API contains ``cudaEventCreate``, which
   has similar functionality.
 
-* The driver API defines a different, but largely overlapping, error code space
-  than the runtime API and uses a different coding convention. For example, the
-  driver API defines ``CUDA_ERROR_INVALID_VALUE``, while the runtime API defines
-  ``cudaErrorInvalidValue``.
+* The driver API offers two additional low-level functionalities not exposed by + the runtime API: module management ``cuModule*`` and context management + ``cuCtx*`` APIs. -The driver API offers two additional functionalities not provided by the runtime -API: ``cuModule`` and ``cuCtx`` APIs. +HIP does not explicitly provide two different APIs, the corresponding functions +for the CUDA driver API are available in the HIP runtime API, and are usually +prefixed with ``hipDrv``. The module and context functionality is available with +the ``hipModule`` and ``hipCtx`` prefix. cuModule API ================================================================================ @@ -120,12 +119,21 @@ For context reference, visit :ref:`context_management_reference`. HIPIFY translation of CUDA driver API ================================================================================ -The HIPIFY tools convert CUDA driver APIs for streams, events, modules, devices, memory management, context, and the profiler to the equivalent HIP calls. For example, ``cuEventCreate`` is translated to ``hipEventCreate``. -HIPIFY tools also convert error codes from the driver namespace and coding conventions to the equivalent HIP error code. HIP unifies the APIs for these common functions. - -The memory copy API requires additional explanation. The CUDA driver includes the memory direction in the name of the API (``cuMemcpyH2D``), while the CUDA driver API provides a single memory copy API with a parameter that specifies the direction. It also supports a "default" direction where the runtime determines the direction automatically. -HIP provides APIs with both styles, for example, ``hipMemcpyH2D`` as well as ``hipMemcpy``. -The first version might be faster in some cases because it avoids any host overhead to detect the different memory directions. +The HIPIFY tools convert CUDA driver APIs such as streams, events, modules, +devices, memory management, context, and the profiler to the equivalent HIP +calls. For example, ``cuEventCreate`` is translated to :cpp:func:`hipEventCreate`. +HIPIFY tools also convert error codes from the driver namespace and coding +conventions to the equivalent HIP error code. HIP unifies the APIs for these +common functions. + +The memory copy API requires additional explanation. The CUDA driver includes +the memory direction in the name of the API (``cuMemcpyHtoD``), while the CUDA +runtime API provides a single memory copy API with a parameter that specifies +the direction. It also supports a "default" direction where the runtime +determines the direction automatically. +HIP provides both versions, for example, :cpp:func:`hipMemcpyHtoD` as well as +:cpp:func:`hipMemcpy`. The first version might be faster in some cases because +it avoids any host overhead to detect the different memory directions. HIP defines a single error space and uses camel case for all errors (i.e. ``hipErrorInvalidValue``). @@ -134,16 +142,25 @@ For further information, visit the :doc:`hipify:index`. Address spaces -------------------------------------------------------------------------------- -HIP-Clang defines a process-wide address space where the CPU and all devices allocate addresses from a single unified pool. -This means addresses can be shared between contexts. Unlike the original CUDA implementation, a new context does not create a new address space for the device. +HIP-Clang defines a process-wide address space where the CPU and all devices +allocate addresses from a single unified pool. 
+This means addresses can be shared between contexts. Unlike the original CUDA +implementation, a new context does not create a new address space for the device. Using hipModuleLaunchKernel -------------------------------------------------------------------------------- -Both CUDA driver and runtime APIs define a function for launching kernels, called ``cuLaunchKernel`` or ``cudaLaunchKernel``. The equivalent API in HIP is ``hipModuleLaunchKernel``. -The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function. -The runtime API additionally provides the ``<<< >>>`` syntax for launching kernels, which resembles a special function call and is easier to use than the explicit launch API, especially when handling kernel arguments. -However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code. +Both CUDA driver and runtime APIs define a function for launching kernels, +called ``cuLaunchKernel`` or ``cudaLaunchKernel``. The equivalent API in HIP is +``hipModuleLaunchKernel``. +The kernel arguments and the execution configuration (grid dimensions, group +dimensions, dynamic shared memory, and stream) are passed as arguments to the +launch function. +The runtime API additionally provides the ``<<< >>>`` syntax for launching +kernels, which resembles a special function call and is easier to use than the +explicit launch API, especially when handling kernel arguments. +However, this syntax is not standard C++ and is available only when NVCC is used +to compile the host code. Additional information -------------------------------------------------------------------------------- @@ -186,12 +203,24 @@ functions. Kernel launching -------------------------------------------------------------------------------- -HIP-Clang supports kernel launching using either the CUDA ``<<<>>>`` syntax, ``hipLaunchKernel``, or ``hipLaunchKernelGGL``. The last option is a macro which expands to the CUDA ``<<<>>>`` syntax by default. It can also be turned into a template by defining ``HIP_TEMPLATE_KERNEL_LAUNCH``. +HIP-Clang supports kernel launching using either the CUDA ``<<<>>>`` syntax, +``hipLaunchKernel``, or ``hipLaunchKernelGGL``. The last option is a macro which +expands to the CUDA ``<<<>>>`` syntax by default. It can also be turned into a +template by defining ``HIP_TEMPLATE_KERNEL_LAUNCH``. -When the executable or shared library is loaded by the dynamic linker, the initialization functions are called. In the initialization functions, the code objects containing all kernels are loaded when ``__hipRegisterFatBinary`` is called. When ``__hipRegisterFunction`` is called, the stub functions are associated with the corresponding kernels in the code objects. +When the executable or shared library is loaded by the dynamic linker, the +initialization functions are called. In the initialization functions, the code +objects containing all kernels are loaded when ``__hipRegisterFatBinary`` is +called. When ``__hipRegisterFunction`` is called, the stub functions are +associated with the corresponding kernels in the code objects. HIP-Clang implements two sets of APIs for launching kernels. -By default, when HIP-Clang encounters the ``<<<>>>`` statement in the host code, it first calls ``hipConfigureCall`` to set up the threads and grids. It then calls the stub function with the given arguments. 
The stub function calls ``hipSetupArgument`` for each kernel argument, then calls ``hipLaunchByPtr`` with a function pointer to the stub function. In ``hipLaunchByPtr``, the actual kernel associated with the stub function is launched.
+By default, when HIP-Clang encounters the ``<<<>>>`` statement in the host code,
+it first calls ``hipConfigureCall`` to set up the threads and grids. It then
+calls the stub function with the given arguments. The stub function calls
+``hipSetupArgument`` for each kernel argument, then calls ``hipLaunchByPtr``
+with a function pointer to the stub function. In ``hipLaunchByPtr``, the actual
+kernel associated with the stub function is launched.
 
 NVCC implementation notes
 ================================================================================
@@ -199,7 +228,9 @@ NVCC implementation notes
 Interoperation between HIP and CUDA driver
 --------------------------------------------------------------------------------
 
-CUDA applications might want to mix CUDA driver code with HIP code (see the example below). This table shows the equivalence between CUDA and HIP types required to implement this interaction.
+CUDA applications might want to mix CUDA driver code with HIP code (see the
+example below). This table shows the equivalence between CUDA and HIP types
+required to implement this interaction.
 
 .. list-table:: Equivalence table between HIP and CUDA types
    :header-rows: 1
@@ -547,3 +578,67 @@ The HIP version number is defined as an integer:
 
 .. code-block:: cpp
 
    HIP_VERSION=HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH
+
+********************************************************************************
+CU_POINTER_ATTRIBUTE_MEMORY_TYPE
+********************************************************************************
+
+To get the pointer's memory type in HIP, developers should use
+:cpp:func:`hipPointerGetAttributes`. First parameter of the function is
+`hipPointerAttribute_t`. Its ``type`` member variable indicates whether the
+memory pointed to is allocated on the device or the host.
+
+For example:
+
+.. code-block:: cpp
+
+  double * ptr;
+  hipMalloc(&ptr, sizeof(double));
+  hipPointerAttribute_t attr;
+  hipPointerGetAttributes(&attr, ptr); /*attr.type is hipMemoryTypeDevice*/
+  if(attr.type == hipMemoryTypeDevice)
+    std::cout << "ptr is of type hipMemoryTypeDevice" << std::endl;
+
+  double* ptrHost;
+  hipHostMalloc(&ptrHost, sizeof(double));
+  hipPointerAttribute_t attrHost;
+  hipPointerGetAttributes(&attrHost, ptrHost); /*attrHost.type is hipMemoryTypeHost*/
+  if(attrHost.type == hipMemoryTypeHost)
+    std::cout << "ptrHost is of type hipMemoryTypeHost" << std::endl;
+
+Note that ``hipMemoryType`` enum values are different from the
+``cudaMemoryType`` enum values.
+
+For example, on AMD platform, `hipMemoryType` is defined in `hip_runtime_api.h`,
+
+.. code-block:: cpp
+
+  typedef enum hipMemoryType {
+    hipMemoryTypeHost = 0,    ///< Memory is physically located on host
+    hipMemoryTypeDevice = 1,  ///< Memory is physically located on device. (see deviceId for specific device)
+    hipMemoryTypeArray = 2,   ///< Array memory, physically located on device. (see deviceId for specific device)
+    hipMemoryTypeUnified = 3, ///< Not used currently
+    hipMemoryTypeManaged = 4  ///< Managed memory, automatically managed by the unified memory system
+  } hipMemoryType;
+
+Looking into CUDA toolkit, it defines `cudaMemoryType` as following,
+
+.. code-block:: cpp
+
+  enum cudaMemoryType
+  {
+    cudaMemoryTypeUnregistered = 0, // Unregistered memory.
+ cudaMemoryTypeHost = 1, // Host memory. + cudaMemoryTypeDevice = 2, // Device memory. + cudaMemoryTypeManaged = 3, // Managed memory + } + +In this case, memory type translation for `hipPointerGetAttributes` needs to be handled properly on NVIDIA platform to get the correct memory type in CUDA, which is done in the file `nvidia_hip_runtime_api.h`. + +So in any HIP applications which use HIP APIs involving memory types, developers should use `#ifdef` in order to assign the correct enum values depending on NVIDIA or AMD platform. + +As an example, please see the code from the `link `_. + +With the `#ifdef` condition, HIP APIs work as expected on both AMD and NVIDIA platforms. + +Note, `cudaMemoryTypeUnregistered` is currently not supported as `hipMemoryType` enum, due to HIP functionality backward compatibility. diff --git a/docs/how-to/hip_porting_guide.rst b/docs/how-to/hip_porting_guide.rst index af248b3ec9..e4cf93d14c 100644 --- a/docs/how-to/hip_porting_guide.rst +++ b/docs/how-to/hip_porting_guide.rst @@ -14,10 +14,22 @@ suggestions on how to port CUDA code and work through common issues. Porting a CUDA Project ******************************************************************************** +Mixing HIP and CUDA code results in valid CUDA code. This enables users to +incrementally port CUDA to HIP, and still compile and test the code during the +transition. + +The only notable exception is ``hipError_t``, which is not just an alias to +``cudaError_t``. In these cases HIP provides functions to convert between the +error code spaces: + +:cpp:func:`hipErrorToCudaError` +:cpp:func:`hipErrorToCUResult` +:cpp:func:`hipCUDAErrorTohipError` +:cpp:func:`hipCUResultTohipError` + General Tips ================================================================================ -* You can incrementally port pieces of the code to HIP while leaving the rest in CUDA. HIP is just a thin layer over CUDA, so the two languages can interoperate. * Starting to port on an NVIDIA machine is often the easiest approach, as the code can be tested for functionality and performance even if not fully ported to HIP. * Once the CUDA code is ported to HIP and is running on the CUDA machine, compile the HIP code for an AMD machine. * You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure. @@ -533,16 +545,6 @@ supports, together with the corresponding macros and device properties. - ``hasDynamicParallelism`` - Ability to launch a kernel from within a kernel -******************************************************************************** -Finding HIP -******************************************************************************** - -Makefiles can use the following syntax to conditionally provide a default HIP_PATH if one does not exist: - -.. code-block:: shell - - HIP_PATH ?= $(shell hipconfig --path) - ******************************************************************************** Compilation ******************************************************************************** @@ -555,6 +557,12 @@ options are appropriate for the target compiler. ``hipconfig`` is a helpful tool in identifying the current systems platform, compiler and runtime. It can also help set options appropriately. +As an example, it can provide a path to HIP, in Makefiles for example: + +.. 
code-block:: shell + + HIP_PATH ?= $(shell hipconfig --path) + HIP Headers ================================================================================ @@ -602,3 +610,41 @@ platforms and architectures. The ``warpSize`` built-in should be used in device code, while the host can query it during runtime via the device properties. See the :ref:`HIP language extension for warpSize ` for information on how to write portable wave-aware code. + +******************************************************************************** +Porting from CUDA __launch_bounds__ +******************************************************************************** + +CUDA also defines a ``__launch_bounds__`` qualifier which works similar to HIP's +implementation, however it uses different parameters: + +.. code-block:: cpp + + __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR) + +The first parameter is the same as HIP's implementation, but +``MIN_BLOCKS_PER_MULTIPROCESSOR`` must be converted to +``MIN_WARPS_PER_EXECUTION``, which uses warps and execution units rather than +blocks and multiprocessors. This conversion is performed automatically by +:doc:`HIPIFY `, or can be done manually with the following +equation. + +.. code-block:: cpp + + MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK) / warpSize + +Directly controlling the warps per execution unit makes it easier to reason +about the occupancy, unlike with blocks, where the occupancy depends on the +block size. + +The use of execution units rather than multiprocessors also provides support for +architectures with multiple execution units per multiprocessor. For example, the +AMD GCN architecture has 4 execution units per multiprocessor. + +maxregcount +================================================================================ + +Unlike ``nvcc``, ``amdclang++`` does not support the ``--maxregcount`` option. +Instead, users are encouraged to use the ``__launch_bounds__`` directive since +the parameters are more intuitive and portable than micro-architecture details +like registers. The directive allows per-kernel control. From 4d07b7634da732829293369ab72adead56b5d700 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Fri, 14 Feb 2025 07:50:47 +0100 Subject: [PATCH 06/32] Minor fix --- docs/how-to/hip_porting_driver_api.rst | 17 ++++--- docs/how-to/hip_porting_guide.rst | 63 +++++++++++++------------- 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/docs/how-to/hip_porting_driver_api.rst b/docs/how-to/hip_porting_driver_api.rst index 41a7aff497..7d7ebbc24d 100644 --- a/docs/how-to/hip_porting_driver_api.rst +++ b/docs/how-to/hip_porting_driver_api.rst @@ -579,9 +579,8 @@ The HIP version number is defined as an integer: HIP_VERSION=HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH -******************************************************************************** CU_POINTER_ATTRIBUTE_MEMORY_TYPE -******************************************************************************** +================================================================================ To get the pointer's memory type in HIP, developers should use :cpp:func:`hipPointerGetAttributes`. 
First parameter of the function is @@ -633,12 +632,18 @@ Looking into CUDA toolkit, it defines `cudaMemoryType` as following, cudaMemoryTypeManaged = 3, // Managed memory } -In this case, memory type translation for `hipPointerGetAttributes` needs to be handled properly on NVIDIA platform to get the correct memory type in CUDA, which is done in the file `nvidia_hip_runtime_api.h`. +In this case, memory type translation for ``hipPointerGetAttributes`` needs to +be handled properly on NVIDIA platform to get the correct memory type in CUDA, +which is done in the file ``nvidia_hip_runtime_api.h``. -So in any HIP applications which use HIP APIs involving memory types, developers should use `#ifdef` in order to assign the correct enum values depending on NVIDIA or AMD platform. +So in any HIP applications which use HIP APIs involving memory types, developers +should use ``#ifdef`` in order to assign the correct enum values depending on +NVIDIA or AMD platform. As an example, please see the code from the `link `_. -With the `#ifdef` condition, HIP APIs work as expected on both AMD and NVIDIA platforms. +With the ``#ifdef`` condition, HIP APIs work as expected on both AMD and NVIDIA +platforms. -Note, `cudaMemoryTypeUnregistered` is currently not supported as `hipMemoryType` enum, due to HIP functionality backward compatibility. +Note, ``cudaMemoryTypeUnregistered`` is currently not supported as +``hipMemoryType`` enum, due to HIP functionality backward compatibility. diff --git a/docs/how-to/hip_porting_guide.rst b/docs/how-to/hip_porting_guide.rst index e4cf93d14c..136084f66b 100644 --- a/docs/how-to/hip_porting_guide.rst +++ b/docs/how-to/hip_porting_guide.rst @@ -2,17 +2,16 @@ :description: This chapter presents how to port CUDA source code to HIP. :keywords: AMD, ROCm, HIP, CUDA, porting, port -################################################################################ +******************************************************************************** HIP porting guide -################################################################################ +******************************************************************************** HIP is designed to ease the porting of existing CUDA code into the HIP environment. This page describes the available tools and provides practical suggestions on how to port CUDA code and work through common issues. -******************************************************************************** Porting a CUDA Project -******************************************************************************** +================================================================================ Mixing HIP and CUDA code results in valid CUDA code. This enables users to incrementally port CUDA to HIP, and still compile and test the code during the @@ -22,21 +21,26 @@ The only notable exception is ``hipError_t``, which is not just an alias to ``cudaError_t``. 
In these cases HIP provides functions to convert between the error code spaces: -:cpp:func:`hipErrorToCudaError` -:cpp:func:`hipErrorToCUResult` -:cpp:func:`hipCUDAErrorTohipError` -:cpp:func:`hipCUResultTohipError` +* :cpp:func:`hipErrorToCudaError` +* :cpp:func:`hipErrorToCUResult` +* :cpp:func:`hipCUDAErrorTohipError` +* :cpp:func:`hipCUResultTohipError` General Tips -================================================================================ +-------------------------------------------------------------------------------- -* Starting to port on an NVIDIA machine is often the easiest approach, as the code can be tested for functionality and performance even if not fully ported to HIP. -* Once the CUDA code is ported to HIP and is running on the CUDA machine, compile the HIP code for an AMD machine. -* You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure. -* Use the `HIPIFY `_ tools to automatically convert CUDA code to HIP, as described in the following section. +* Starting to port on an NVIDIA machine is often the easiest approach, as the + code can be tested for functionality and performance even if not fully ported + to HIP. +* Once the CUDA code is ported to HIP and is running on the CUDA machine, + compile the HIP code for an AMD machine. +* You can handle platform-specific features through conditional compilation or + by adding them to the open-source HIP infrastructure. +* Use the `HIPIFY `_ tools to automatically + convert CUDA code to HIP, as described in the following section. HIPIFY -================================================================================ +-------------------------------------------------------------------------------- :doc:`HIPIFY ` is a collection of tools that automatically translate CUDA to HIP code. There are two flavours available, ``hipfiy-clang`` @@ -126,7 +130,7 @@ hipified code to ``stdout``. directories. Library Equivalents -================================================================================ +-------------------------------------------------------------------------------- ROCm provides libraries to ease porting of code relying on CUDA libraries. Most CUDA libraries have a corresponding HIP library. @@ -213,12 +217,11 @@ corresponding CUDA library, depending on the platform, to provide compatibility. - Communications Primitives Library based on the MPI equivalents RCCL is a drop-in replacement for NCCL -******************************************************************************** Distinguishing compilers and platforms -******************************************************************************** +================================================================================ Identifying the HIP Target Platform -================================================================================ +-------------------------------------------------------------------------------- HIP projects can target either the AMD or NVIDIA platform. The platform affects which backend-headers are included and which libraries are used for linking. The @@ -388,9 +391,8 @@ for the device. .. 
_identifying_device_architecture_features: -******************************************************************************** Identifying Device Architecture Features -******************************************************************************** +================================================================================ GPUs of different generations and architectures do not all provide the same level of :doc:`hardware feature support <../reference/hardware_features>`. To @@ -398,7 +400,7 @@ guard device-code using these architecture dependent features, the ``__HIP_ARCH___`` C++-macros can be used. Device Code Feature Identification -================================================================================ +-------------------------------------------------------------------------------- Some CUDA code tests ``__CUDA_ARCH__`` for a specific value to determine whether the GPU supports a certain architectural feature, depending on its compute @@ -422,7 +424,7 @@ For host code, the ``__HIP_ARCH___`` defines are set to 0, if upon in host code. Host Code Feature Identification -================================================================================ +-------------------------------------------------------------------------------- Host code must not rely on the ``__HIP_ARCH___`` macros, as the GPUs available to a system can not be known during compile time, and their @@ -464,7 +466,7 @@ Host code can query architecture feature flags during runtime, by using } Table of Architecture Properties -================================================================================ +-------------------------------------------------------------------------------- The table below shows the full set of architectural properties that HIP supports, together with the corresponding macros and device properties. @@ -545,9 +547,8 @@ supports, together with the corresponding macros and device properties. - ``hasDynamicParallelism`` - Ability to launch a kernel from within a kernel -******************************************************************************** Compilation -******************************************************************************** +================================================================================ ``hipcc`` is a portable compiler driver that calls ``nvcc`` or ``amdclang++`` and forwards the appropriate options. It passes options through @@ -564,7 +565,7 @@ As an example, it can provide a path to HIP, in Makefiles for example: HIP_PATH ?= $(shell hipconfig --path) HIP Headers -================================================================================ +-------------------------------------------------------------------------------- The ``hip_runtime.h`` headers define all the necessary types, functions, macros, etc., needed to compile a HIP program, this includes host as well as device @@ -575,7 +576,7 @@ need to convert hipified code to include the richer ``hip_runtime.h`` instead of ``hip_runtime_api.h``. Using a Standard C++ Compiler -================================================================================ +-------------------------------------------------------------------------------- You can compile ``hip_runtime_api.h`` using a standard C or C++ compiler (e.g., ``gcc`` or ``icc``). @@ -601,9 +602,8 @@ The ``hipify`` tool automatically converts ``cuda_runtime.h`` to ``hip_runtime.h``, and it converts ``cuda_runtime_api.h`` to ``hip_runtime_api.h``, but it may miss nested headers or macros. 
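As a concrete sketch of the host-compiler path described above (the file name, flags, and
install paths are assumptions for illustration; adjust them to the local ROCm setup), a
translation unit that only calls HIP APIs can typically be built with ``g++`` by combining
the ``hipconfig --cpp_config`` output with the HIP runtime library:

.. code-block:: cpp

   // query_devices.cpp - uses the HIP runtime API only; no kernels are defined or launched
   #include <hip/hip_runtime_api.h>
   #include <iostream>

   int main() {
       int count = 0;
       if (hipGetDeviceCount(&count) != hipSuccess) {
           std::cerr << "HIP runtime not available" << std::endl;
           return 1;
       }
       std::cout << "Visible HIP devices: " << count << std::endl;
       return 0;
   }

.. code-block:: bash

   g++ query_devices.cpp $(hipconfig --cpp_config) -L/opt/rocm/lib -lamdhip64 -o query_devices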
-******************************************************************************** warpSize -******************************************************************************** +================================================================================ Code should not assume a warp size of 32 or 64, as that is not portable between platforms and architectures. The ``warpSize`` built-in should be used in device @@ -611,9 +611,8 @@ code, while the host can query it during runtime via the device properties. See the :ref:`HIP language extension for warpSize ` for information on how to write portable wave-aware code. -******************************************************************************** Porting from CUDA __launch_bounds__ -******************************************************************************** +================================================================================ CUDA also defines a ``__launch_bounds__`` qualifier which works similar to HIP's implementation, however it uses different parameters: @@ -642,7 +641,7 @@ architectures with multiple execution units per multiprocessor. For example, the AMD GCN architecture has 4 execution units per multiprocessor. maxregcount -================================================================================ +-------------------------------------------------------------------------------- Unlike ``nvcc``, ``amdclang++`` does not support the ``--maxregcount`` option. Instead, users are encouraged to use the ``__launch_bounds__`` directive since From 5defe21c51fb0705a02f0147c535e4e7eecaea36 Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Fri, 14 Feb 2025 14:27:17 +0100 Subject: [PATCH 07/32] Docs: Update environment variables file --- docs/data/env_variables_hip.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/data/env_variables_hip.rst b/docs/data/env_variables_hip.rst index 6186671ecf..4192db7387 100644 --- a/docs/data/env_variables_hip.rst +++ b/docs/data/env_variables_hip.rst @@ -2,6 +2,9 @@ :description: HIP environment variables :keywords: AMD, HIP, environment variables, environment +HIP GPU isolation variables +-------------------------------------------------------------------------------- + The GPU isolation environment variables in HIP are collected in the following table. .. _hip-env-isolation: @@ -24,6 +27,9 @@ The GPU isolation environment variables in HIP are collected in the following ta | Device indices exposed to HIP applications. - Example: ``0,2`` +HIP profiling variables +-------------------------------------------------------------------------------- + The profiling environment variables in HIP are collected in the following table. .. _hip-env-prof: @@ -50,6 +56,9 @@ The profiling environment variables in HIP are collected in the following table. - | 0: Disable | 1: Enable +HIP debug variables +-------------------------------------------------------------------------------- + The debugging environment variables in HIP are collected in the following table. .. _hip-env-debug: @@ -149,6 +158,9 @@ The debugging environment variables in HIP are collected in the following table. number does not apply to hardware queues that are created for CU-masked HIP streams, or cooperative queues for HIP Cooperative Groups (single queue per device). +HIP memory management related variables +-------------------------------------------------------------------------------- + The memory management related environment variables in HIP are collected in the following table. 
@@ -245,6 +257,9 @@ following table. - | 0: Disable | 1: Enable +HIP miscellaneous variables +-------------------------------------------------------------------------------- + The following table lists environment variables that are useful but relate to different features in HIP. From f1cca4606d6c1ea5089aecf772c6216a30ae0b08 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2025 00:45:05 +0000 Subject: [PATCH 08/32] Bump rocm-docs-core[api_reference] from 1.15.0 to 1.17.0 in /docs/sphinx Bumps [rocm-docs-core[api_reference]](https://github.com/ROCm/rocm-docs-core) from 1.15.0 to 1.17.0. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.15.0...v1.17.0) --- updated-dependencies: - dependency-name: rocm-docs-core[api_reference] dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 3a34f77e71..f7fb1c634a 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core[api_reference]==1.15.0 +rocm-docs-core[api_reference]==1.17.0 sphinxcontrib.doxylink diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index e88b29e61e..3707dd92a3 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -211,7 +211,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.15.0 +rocm-docs-core[api-reference]==1.17.0 # via -r requirements.in rpds-py==0.22.3 # via From a593bf9ac33945422343bf18cfb97e23dff7c8d5 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Fri, 7 Feb 2025 14:11:52 +0100 Subject: [PATCH 09/32] Remove release.md and remove more-info section from readme --- README.md | 27 ------- RELEASE.md | 216 ----------------------------------------------------- 2 files changed, 243 deletions(-) delete mode 100644 RELEASE.md diff --git a/README.md b/README.md index 4df0b4c6a9..ed6db9581b 100644 --- a/README.md +++ b/README.md @@ -36,33 +36,6 @@ HIP releases are typically naming convention for each ROCM release to help diffe * rocm x.yy: These are the stable releases based on the ROCM release. 
This type of release is typically made once a month.* -## More Info - -* [Installation](docs/install/install.rst) -* [HIP FAQ](docs/faq.rst) -* [HIP C++ Language Extensions](docs/reference/cpp_language_extensions.rst) -* [HIP Porting Guide](docs/how-to/hip_porting_guide.md) -* [HIP Porting Driver Guide](docs/how-to/hip_porting_driver_api.rst) -* [HIP Programming Guide](docs/programming_guide.rst) -* [HIP Logging](docs/how-to/logging.rst) -* [Building HIP From Source](docs/install/build.rst) -* [HIP Debugging](docs/how-to/debugging.rst) -* [HIP RTC](docs/how-to/hip_rtc.md) -* [HIP Terminology](docs/reference/terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/OpenCL) -* [HIPIFY](https://github.com/ROCm/HIPIFY/blob/amd-staging/README.md) -* Supported CUDA APIs: - * [Runtime API](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/reference/tables/CUDA_Runtime_API_functions_supported_by_HIP.md) - * [Driver API](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/reference/tables/CUDA_Driver_API_functions_supported_by_HIP.md) - * [cuComplex API](https://github.com/ROCm/HIPIFY/blob/amd-staging/reference/docs/tables/cuComplex_API_supported_by_HIP.md) - * [Device API](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/reference/tables/CUDA_Device_API_supported_by_HIP.md) - * [cuBLAS](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/reference/tables/CUBLAS_API_supported_by_ROC.md) - * [cuRAND](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/reference/tables/CURAND_API_supported_by_HIP.md) - * [cuDNN](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/reference/tables/CUDNN_API_supported_by_HIP.md) - * [cuFFT](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/reference/tables/CUFFT_API_supported_by_HIP.md) - * [cuSPARSE](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/reference/tables/CUSPARSE_API_supported_by_HIP.md) -* [Developer/CONTRIBUTING Info](CONTRIBUTING.md) -* [Release Notes](RELEASE.md) - ## How do I get set up? See the [Installation](docs/install/install.rst) notes. diff --git a/RELEASE.md b/RELEASE.md deleted file mode 100644 index 15fb221549..0000000000 --- a/RELEASE.md +++ /dev/null @@ -1,216 +0,0 @@ -# Release notes - -We have attempted to document known bugs and limitations - in particular the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) document uses the phrase "Under Development", and the [HIP Runtime API issue list](https://github.com/ROCm/HIP/issues) lists known bugs. - - -=================================================================================================== - - -## Revision History: - -=================================================================================================== -Release: 1.5 -Date: -- Support threadIdx, blockIdx, blockDim directly (no need for hipify conversions in kernels.) HIP - Kernel syntax is now identical to CUDA kernel syntax - no need for extra parms or conversions. -- Refactor launch syntax. HIP now extracts kernels from the executable and launches them using the - existing module interface. Kernels dispatch no longer flows through HCC. Result is faster - kernel launches and with less resource usage (no signals required). -- Remove requirement for manual "serializers" previously required when passing complex structures - into kernels. 
-- Remove need for manual destructors -- Provide printf in device code -- Support for globals when using module API -- hipify-clang now supports using newer versions of clang -- HIP texture support equivalent to CUDA texture driver APIs -- Updates to hipify-perl, hipify-clang and documentation - - -=================================================================================================== -Release: 1.4 -Date: 2017.10.06 -- Improvements to HIP event management -- Added new HIP_TRACE_API options -- Enabled device side assert support -- Several bug fixes including hipMallocArray, hipTexture fetch -- Support for RHEL/CentOS 7.4 -- Updates to hipify-perl, hipify-clang and documentation - - -=================================================================================================== -Release: 1.3 -Date: 2017.08.16 -- hipcc now auto-detects amdgcn arch. No need to specify the arch when building for same system. -- HIP texture support (run-time APIs) -- Implemented __threadfence_support -- Improvements in HIP context management logic -- Bug fixes in several APIs including hipDeviceGetPCIBusId, hipEventDestroy, hipMemcpy2DAsync -- Updates to hipify-clang and documentation -- HIP development now fully open and on GitHub. Developers should submit pull requests. - - -=================================================================================================== -Release: 1.2 -Date: 2017.06.29 -- new APIs: hipMemcpy2DAsync, hipMallocPitch, hipHostMallocCoherent, hipHostMallocNonCoherent -- added support for building hipify-clang using clang 3.9 -- hipify-clang updates for CUDA 8.0 runtime+driver support -- renamed hipify to hipify-perl -- initial implementation of hipify-cmakefile -- several documentation updates & bug fixes -- support for abort() function in device code - - -=================================================================================================== -Release: 1.0.17102 -Date: 2017.03.07 -- Lots of improvements to hipify-clang. -- Added HIP package config for cmake. -- Several bug fixes and documentation updates. - - -=================================================================================================== -Release: 1.0.17066 -Date: 2017.02.11 -- Improved support for math device functions. -- Added several half math device functions. -- Enabled support for CUDA 8.0 in hipify-clang. -- Lots of bug fixes and documentation updates. - - -=================================================================================================== -Release: 1.0.17015 -Date: 2017.01.06 -- Several improvements to the hipify-clang infrastructure. -- Refactored module and function APIs. -- HIP now defaults to linking against the shared runtime library. -- Documentation updates. - - -=================================================================================================== -Release: 1.0.16502 -Date: 2016.12.13 -- Added several fast math and packaged math instrincs -- Improved debug and profiler documentation -- Support for building and linking to HIP shared library -- Several improvements to hipify-clang -- Several bug fixes - - -=================================================================================================== -Release: 1.0.16461 -Date: 2016.11.14 -- Significant changes to the HIP Profiling APIs. 
Refer to the documentation for details -- Improvements to P2P support -- New API: hipDeviceGetByPCIBusId -- Several bug fixes in NV path -- hipModuleLaunch now works for multi-dim kernels - - -=================================================================================================== -Release:1.0 -Date: 2016.11.8 -- Initial implementation for FindHIP.cmake -- HIP library now installs as a static library by default -- Added support for HIP context and HIP module APIs -- Major changes to HIP signal & memory management implementation -- Support for complex data type and math functions -- clang-hipify is now known as hipify-clang -- Added several new HIP samples -- Preliminary support for new APIs: hipMemcpyToSymbol, hipDeviceGetLimit, hipRuntimeGetVersion -- Added support for async memcpy driver API (for example hipMemcpyHtoDAsync) -- Support for memory management device functions: malloc, free, memcpy & memset -- Removed deprecated HIP runtime header locations. Please include "hip/hip_runtime.h" instead of "hip_runtime.h". You can use `find . -type f -exec sed -i 's:#include "hip_runtime.h":#include "hip/hip_runtime.h":g' {} +` to replace all such references - - -=================================================================================================== -Release:0.92.00 -Date: 2016.8.14 -- hipLaunchKernel supports one-dimensional grid and/or block dims, without explicit cast to dim3 type (actually in 0.90.00) -- fp16 software support -- Support for Hawaii dGPUs using environment variable ROCM_TARGET=hawaii -- Support hipArray -- Improved profiler support -- Documentation updates -- Improvements to clang-hipify - - -=================================================================================================== -Release:0.90.00 -Date: 2016.06.29 -- Support dynamic shared memory allocations -- Min HCC compiler version is > 16186. -- Expanded math functions (device and host). Document unsupported functions. -- hipFree with null pointer initializes runtime and returns success. -- Improve error code reporting on nvcc. -- Add hipPeekAtError for nvcc. - - -=================================================================================================== -Release:0.86.00 -Date: 2016.06.06 -- Add clang-hipify : clang-based hipify tool. Improved parsing of source code, and automates - creation of hipLaunchParm variable. -- Implement memory register / unregister commands (hipHostRegister, hipHostUnregister) -- Add cross-linking support between G++ and HCC, in particular for interfaces that use - standard C++ libraries (ie std::vectors, std::strings). HIPCC now uses libstdc++ by default on the HCC - compilation path. -- More samples including gpu-burn, SHOC, nbody, rtm. See [HIP-Examples](https://github.com/ROCm/HIP-Examples) - - -=================================================================================================== -Release:0.84.01 -Date: 2016.04.25 -- Refactor HIP make and install system: - - Move to CMake. Refer to the installation section in README.md for details. - - Split source into multiple modular .cpp and .h files. - - Create static library and link. - - Set HIP_PATH to install. -- Make hipDevice and hipStream thread-safe. - - Preferred hipStream usage is still to create new streams for each new thread, but it works even if you don;t. -- Improve automated platform detection: If AMD GPU is installed and detected by driver, default HIP_PLATFORM to hcc. -- HIP_TRACE_API now prints arguments to the HIP function (in addition to name of function). 
-- Deprecate hipDeviceGetProp (Replace with hipGetDeviceProp) -- Deprecate hipMallocHost (Replace with hipHostMalloc) -- Deprecate hipFreeHost (Replace with hipHostFree) -- The mixbench benchmark tool for measuring operational intensity now has a HIP target, in addition to CUDA and OpenCL. Let the comparisons begin. :) -See here for more : https://github.com/ekondis/mixbench. - - -=================================================================================================== -Release:0.82.00 -Date: 2016.03.07 -- Bump minimum required HCC workweek to 16074. -- Bump minimum required ROCK-Kernel-Driver and ROCR-Runtime to Developer Preview 2. -- Enable multi-GPU support. - * Use hipSetDevice to select a device for subsequent kernel calls and memory allocations. - * CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICE environment variable selects devices visible to the runtime. -- Support hipStreams – send sequences of copy and kernel commands to a device. - * Asynchronous copies supported. -- Optimize memory copy operations. -- Support hipPointerGetAttribute – can determine if a pointer is host or device. -- Enable atomics to local memory. -- Support for LC Direct-To-ISA path. -- Improved free memory reporting. - * hipMemGetInfo (report full memory used in current process). - * hipDeviceReset (deletes all memory allocated by current process). - - -=================================================================================================== -Release:0.80.01 -Date: 2016.02.18 -- Improve reporting and support for device-side math functions. -- Update Runtime Documentation. -- Improve implementations of cross-lane operations (_ballot, _any, _all). -- Provide shuffle intrinsics (performance optimization in-progress). -- Support hipDeviceAttribute for querying "one-shot" device attributes, as an alternative to hipGetDeviceProperties. - - -=================================================================================================== -Release:0.80.00 -Date: 2016.01.25 - -Initial release with GPUOpen Launch. - - - From dcf0971bc2ec5c5ed6860bdbc2160fb3f9068157 Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Wed, 5 Feb 2025 15:30:11 +0100 Subject: [PATCH 10/32] Docs: Update FP8 page to show both FP8 and FP16 types --- .wordlist.txt | 1 + docs/index.md | 2 +- docs/reference/fp8_numbers.rst | 230 ---------------- docs/reference/low_fp_types.rst | 470 ++++++++++++++++++++++++++++++++ docs/sphinx/_toc.yml.in | 4 +- 5 files changed, 474 insertions(+), 233 deletions(-) delete mode 100644 docs/reference/fp8_numbers.rst create mode 100644 docs/reference/low_fp_types.rst diff --git a/.wordlist.txt b/.wordlist.txt index de7c91b31a..009c63b73f 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -9,6 +9,7 @@ AXPY asm asynchrony backtrace +bfloat Bitcode bitcode bitcodes diff --git a/docs/index.md b/docs/index.md index 247c58e2fd..23f352e306 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,7 +46,7 @@ The HIP documentation is organized into the following categories: * [HIP environment variables](./reference/env_variables) * [CUDA to HIP API Function Comparison](./reference/api_syntax) * [List of deprecated APIs](./reference/deprecated_api_list) -* [FP8 numbers in HIP](./reference/fp8_numbers) +* [Low Precision Floating Point Types](./reference/low_fp_types) * {doc}`./reference/hardware_features` ::: diff --git a/docs/reference/fp8_numbers.rst b/docs/reference/fp8_numbers.rst deleted file mode 100644 index 248ae21ff3..0000000000 --- a/docs/reference/fp8_numbers.rst +++ /dev/null @@ -1,230 +0,0 @@ -.. 
meta:: - :description: This page describes FP8 numbers present in HIP. - :keywords: AMD, ROCm, HIP, fp8, fnuz, ocp - -******************************************************************************* -FP8 Numbers -******************************************************************************* - -`FP8 numbers `_ were introduced to accelerate deep learning inferencing. They provide higher throughput of matrix operations because the smaller size allows more of them in the available fixed memory. - -HIP has two FP8 number representations called *FP8-OCP* and *FP8-FNUZ*. - -Open Compute Project(OCP) number definition can be found `here `_. - -Definition of FNUZ: fnuz suffix means only finite and NaN values are supported. Unlike other types, Inf are not supported. -NaN is when sign bit is set and all other exponent and mantissa bits are 0. All other values are finite. -This provides one extra value of exponent and adds to the range of supported FP8 numbers. - -FP8 Definition -============== - -FP8 numbers are composed of a sign, an exponent and a mantissa. Their sizes are dependent on the format. -There are two formats of FP8 numbers, E4M3 and E5M2. - -- E4M3: 1 bit sign, 4 bit exponent, 3 bit mantissa -- E5M2: 1 bit sign, 5 bit exponent, 2 bit mantissa - -HIP Header -========== - -The `HIP header `_ defines the FP8 ocp/fnuz numbers. - -Supported Devices -================= - -.. list-table:: Supported devices for fp8 numbers - :header-rows: 1 - - * - Device Type - - FNUZ FP8 - - OCP FP8 - * - Host - - Yes - - Yes - * - gfx942 - - Yes - - No - * - gfx1200/gfx1201 - - No - - Yes - -Usage -===== - -To use the FP8 numbers inside HIP programs. - -.. code-block:: c - - #include - -FP8 numbers can be used on CPU side: - -.. code-block:: c - - __hip_fp8_storage_t convert_float_to_fp8( - float in, /* Input val */ - __hip_fp8_interpretation_t interpret, /* interpretation of number E4M3/E5M2 */ - __hip_saturation_t sat /* Saturation behavior */ - ) { - return __hip_cvt_float_to_fp8(in, sat, interpret); - } - -The same can be done in kernels as well. - -.. code-block:: c - - __device__ __hip_fp8_storage_t d_convert_float_to_fp8( - float in, - __hip_fp8_interpretation_t interpret, - __hip_saturation_t sat) { - return __hip_cvt_float_to_fp8(in, sat, interpret); - } - -An important thing to note here is if you use this on gfx94x GPU, it will be fnuz number but on any other GPU it will be an OCP number. - -The following code example does roundtrip FP8 conversions on both the CPU and GPU and compares the results. - -.. 
code-block:: c - - #include - #include - #include - #include - - #define hip_check(hip_call) \ - { \ - auto hip_res = hip_call; \ - if (hip_res != hipSuccess) { \ - std::cerr << "Failed in hip call: " << #hip_call \ - << " with error: " << hipGetErrorName(hip_res) << std::endl; \ - std::abort(); \ - } \ - } - - __device__ __hip_fp8_storage_t d_convert_float_to_fp8( - float in, __hip_fp8_interpretation_t interpret, __hip_saturation_t sat) { - return __hip_cvt_float_to_fp8(in, sat, interpret); - } - - __device__ float d_convert_fp8_to_float(float in, - __hip_fp8_interpretation_t interpret) { - __half hf = __hip_cvt_fp8_to_halfraw(in, interpret); - return hf; - } - - __global__ void float_to_fp8_to_float(float *in, - __hip_fp8_interpretation_t interpret, - __hip_saturation_t sat, float *out, - size_t size) { - int i = threadIdx.x; - if (i < size) { - auto fp8 = d_convert_float_to_fp8(in[i], interpret, sat); - out[i] = d_convert_fp8_to_float(fp8, interpret); - } - } - - __hip_fp8_storage_t - convert_float_to_fp8(float in, /* Input val */ - __hip_fp8_interpretation_t - interpret, /* interpretation of number E4M3/E5M2 */ - __hip_saturation_t sat /* Saturation behavior */ - ) { - return __hip_cvt_float_to_fp8(in, sat, interpret); - } - - float convert_fp8_to_float( - __hip_fp8_storage_t in, /* Input val */ - __hip_fp8_interpretation_t - interpret /* interpretation of number E4M3/E5M2 */ - ) { - __half hf = __hip_cvt_fp8_to_halfraw(in, interpret); - return hf; - } - - int main() { - constexpr size_t size = 32; - hipDeviceProp_t prop; - hip_check(hipGetDeviceProperties(&prop, 0)); - bool is_supported = (std::string(prop.gcnArchName).find("gfx94") != std::string::npos) || // gfx94x - (std::string(prop.gcnArchName).find("gfx120") != std::string::npos); // gfx120x - if(!is_supported) { - std::cerr << "Need a gfx94x or gfx120x, but found: " << prop.gcnArchName << std::endl; - std::cerr << "No device conversions are supported, only host conversions are supported." << std::endl; - return -1; - } - - const __hip_fp8_interpretation_t interpret = (std::string(prop.gcnArchName).find("gfx94") != std::string::npos) - ? __HIP_E4M3_FNUZ // gfx94x - : __HIP_E4M3; // gfx120x - constexpr __hip_saturation_t sat = __HIP_SATFINITE; - - std::vector in; - in.reserve(size); - for (size_t i = 0; i < size; i++) { - in.push_back(i + 1.1f); - } - - std::cout << "Converting float to fp8 and back..." << std::endl; - // CPU convert - std::vector cpu_out; - cpu_out.reserve(size); - for (const auto &fval : in) { - auto fp8 = convert_float_to_fp8(fval, interpret, sat); - cpu_out.push_back(convert_fp8_to_float(fp8, interpret)); - } - - // GPU convert - float *d_in, *d_out; - hip_check(hipMalloc(&d_in, sizeof(float) * size)); - hip_check(hipMalloc(&d_out, sizeof(float) * size)); - - hip_check(hipMemcpy(d_in, in.data(), sizeof(float) * in.size(), - hipMemcpyHostToDevice)); - - float_to_fp8_to_float<<<1, size>>>(d_in, interpret, sat, d_out, size); - - std::vector gpu_out(size, 0.0f); - hip_check(hipMemcpy(gpu_out.data(), d_out, sizeof(float) * gpu_out.size(), - hipMemcpyDeviceToHost)); - - hip_check(hipFree(d_in)); - hip_check(hipFree(d_out)); - - // Validation - for (size_t i = 0; i < size; i++) { - if (cpu_out[i] != gpu_out[i]) { - std::cerr << "cpu round trip result: " << cpu_out[i] - << " - gpu round trip result: " << gpu_out[i] << std::endl; - std::abort(); - } - } - std::cout << "...CPU and GPU round trip convert matches." << std::endl; - } - -There are C++ style classes available as well. - -.. 
code-block:: c - - __hip_fp8_e4m3_fnuz fp8_val(1.1f); // gfx94x - __hip_fp8_e4m3 fp8_val(1.1f); // gfx120x - -Each type of FP8 number has its own class: - -- __hip_fp8_e4m3 -- __hip_fp8_e5m2 -- __hip_fp8_e4m3_fnuz -- __hip_fp8_e5m2_fnuz - -There is support of vector of FP8 types. - -- __hip_fp8x2_e4m3: holds 2 values of OCP FP8 e4m3 numbers -- __hip_fp8x4_e4m3: holds 4 values of OCP FP8 e4m3 numbers -- __hip_fp8x2_e5m2: holds 2 values of OCP FP8 e5m2 numbers -- __hip_fp8x4_e5m2: holds 4 values of OCP FP8 e5m2 numbers -- __hip_fp8x2_e4m3_fnuz: holds 2 values of FP8 fnuz e4m3 numbers -- __hip_fp8x4_e4m3_fnuz: holds 4 values of FP8 fnuz e4m3 numbers -- __hip_fp8x2_e5m2_fnuz: holds 2 values of FP8 fnuz e5m2 numbers -- __hip_fp8x4_e5m2_fnuz: holds 4 values of FP8 fnuz e5m2 numbers - -FNUZ extensions will be available on gfx94x only. diff --git a/docs/reference/low_fp_types.rst b/docs/reference/low_fp_types.rst new file mode 100644 index 0000000000..7fe450a35f --- /dev/null +++ b/docs/reference/low_fp_types.rst @@ -0,0 +1,470 @@ +.. meta:: + :description: This page describes the FP8 and FP16 types present in HIP. + :keywords: AMD, ROCm, HIP, fp8, fnuz, ocp + +******************************************************************************* +Low precision floating point types +******************************************************************************* + +Modern computing tasks often require balancing numerical precision against hardware resources +and processing speed. Low precision floating point number formats in HIP include FP8 (Quarter Precision) +and FP16 (Half Precision), which reduce memory and bandwidth requirements compared to traditional +32-bit or 64-bit formats. The following sections detail their specifications, variants, and provide +practical guidance for implementation in HIP. + +FP8 (Quarter Precision) +======================= + +`FP8 (Floating Point 8-bit) numbers `_ were introduced +as a compact numerical format specifically tailored for deep learning inference. By reducing +precision while maintaining computational effectiveness, FP8 allows for significant memory +savings and improved processing speed. This makes it particularly beneficial for deploying +large-scale models with strict efficiency constraints. + +Unlike traditional floating-point formats such as FP32 or even FP16, FP8 further optimizes +performance by enabling a higher volume of matrix operations per second. Its reduced bit-width +minimizes bandwidth requirements, making it an attractive choice for hardware accelerators +in deep learning applications. + +There are two primary FP8 formats: + +- **E4M3 Format** + + - Sign: 1 bit + - Exponent: 4 bits + - Mantissa: 3 bits + +- **E5M2 Format** + + - Sign: 1 bit + - Exponent: 5 bits + - Mantissa: 2 bits + +The E4M3 format offers higher precision with a narrower range, while the E5M2 format provides +a wider range at the cost of some precision. + +Additionally, FP8 numbers have two representations: + +- **FP8-OCP (Open Compute Project)** + + - `This `_ + is a standardized format developed by the Open Compute Project to ensure compatibility + across various hardware and software implementations. + +- **FP8-FNUZ (Finite and NaN Only)** + + - A specialized format optimized for specific computations, supporting only finite and NaN values + (no Inf support). + - This provides one extra value of exponent and adds to the range of supported FP8 numbers. + - **NaN Definition**: When the sign bit is set, and all other exponent and mantissa bits are zero. 
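+
+To make the bit layout concrete, the short host-side sketch below decodes a normal E4M3 value
+directly from its raw byte. ``decode_e4m3`` is only an illustrative helper, not part of the HIP
+API; it assumes the usual OCP E4M3 exponent bias of 7 and skips NaN and subnormal handling:
+
+.. code-block:: cpp
+
+   #include <cmath>
+   #include <cstdint>
+   #include <cstdio>
+
+   // Decode a normal OCP E4M3 value: (-1)^sign * 2^(exponent - 7) * (1 + mantissa / 8)
+   // Illustrative helper only; assumes bias 7 and ignores NaN/subnormal encodings.
+   float decode_e4m3(uint8_t bits) {
+       int sign     = (bits >> 7) & 0x1;
+       int exponent = (bits >> 3) & 0xF;
+       int mantissa =  bits       & 0x7;
+       float value  = (1.0f + mantissa / 8.0f) * std::ldexp(1.0f, exponent - 7);
+       return sign ? -value : value;
+   }
+
+   int main() {
+       std::printf("%g\n", decode_e4m3(0b01000100)); // sign 0, exponent 8, mantissa 4 -> 3
+   }
+
+The FNUZ variant follows the same pattern but with the exponent bias shifted to 8, which is where
+its extra exponent value comes from.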
+ +The FNUZ representation provides an extra exponent value, expanding the range of representable +numbers compared to standard FP8 formats. + + +HIP Header +---------- + +The `HIP FP8 header `_ +defines the FP8 ocp/fnuz numbers. + +Supported Devices +----------------- + +Different GPU models support different FP8 formats. Here's a breakdown: + +.. list-table:: Supported devices for fp8 numbers + :header-rows: 1 + + * - Device Type + - FNUZ FP8 + - OCP FP8 + * - Host + - Yes + - Yes + * - CDNA1 + - No + - No + * - CDNA2 + - No + - No + * - CDNA3 + - Yes + - No + * - RDNA2 + - No + - No + * - RDNA3 + - No + - No + +Using FP8 Numbers in HIP Programs +--------------------------------- + +To use the FP8 numbers inside HIP programs. + +.. code-block:: cpp + + #include + +FP8 numbers can be used on CPU side: + +.. code-block:: cpp + + __hip_fp8_storage_t convert_float_to_fp8( + float in, /* Input val */ + __hip_fp8_interpretation_t interpret, /* interpretation of number E4M3/E5M2 */ + __hip_saturation_t sat /* Saturation behavior */ + ) { + return __hip_cvt_float_to_fp8(in, sat, interpret); + } + +The same can be done in kernels as well. + +.. code-block:: cpp + + __device__ __hip_fp8_storage_t d_convert_float_to_fp8( + float in, + __hip_fp8_interpretation_t interpret, + __hip_saturation_t sat) { + return __hip_cvt_float_to_fp8(in, sat, interpret); + } + +Note: On a gfx94x GPU, the type will default to the fnuz type. + +The following code example does roundtrip FP8 conversions on both the CPU and GPU and compares the results. + +.. code-block:: cpp + + #include + #include + #include + #include + + #define hip_check(hip_call) \ + { \ + auto hip_res = hip_call; \ + if (hip_res != hipSuccess) { \ + std::cerr << "Failed in HIP call: " << #hip_call \ + << " at " << __FILE__ << ":" << __LINE__ \ + << " with error: " << hipGetErrorString(hip_res) << std::endl; \ + std::abort(); \ + } \ + } + + __device__ __hip_fp8_storage_t d_convert_float_to_fp8( + float in, __hip_fp8_interpretation_t interpret, __hip_saturation_t sat) { + return __hip_cvt_float_to_fp8(in, sat, interpret); + } + + __device__ float d_convert_fp8_to_float(float in, + __hip_fp8_interpretation_t interpret) { + __half hf = __hip_cvt_fp8_to_halfraw(in, interpret); + return hf; + } + + __global__ void float_to_fp8_to_float(float *in, + __hip_fp8_interpretation_t interpret, + __hip_saturation_t sat, float *out, + size_t size) { + int i = threadIdx.x; + if (i < size) { + auto fp8 = d_convert_float_to_fp8(in[i], interpret, sat); + out[i] = d_convert_fp8_to_float(fp8, interpret); + } + } + + __hip_fp8_storage_t + convert_float_to_fp8(float in, /* Input val */ + __hip_fp8_interpretation_t + interpret, /* interpretation of number E4M3/E5M2 */ + __hip_saturation_t sat /* Saturation behavior */ + ) { + return __hip_cvt_float_to_fp8(in, sat, interpret); + } + + float convert_fp8_to_float( + __hip_fp8_storage_t in, /* Input val */ + __hip_fp8_interpretation_t + interpret /* interpretation of number E4M3/E5M2 */ + ) { + __half hf = __hip_cvt_fp8_to_halfraw(in, interpret); + return hf; + } + + int main() { + constexpr size_t size = 32; + hipDeviceProp_t prop; + hip_check(hipGetDeviceProperties(&prop, 0)); + bool is_supported = (std::string(prop.gcnArchName).find("gfx94") != std::string::npos); // gfx94x + if(!is_supported) { + std::cerr << "Need a gfx94x, but found: " << prop.gcnArchName << std::endl; + std::cerr << "No device conversions are supported, only host conversions are supported." 
<< std::endl; + return -1; + } + + const __hip_fp8_interpretation_t interpret = (std::string(prop.gcnArchName).find("gfx94") != std::string::npos) + ? __HIP_E4M3_FNUZ // gfx94x + : __HIP_E4M3; + constexpr __hip_saturation_t sat = __HIP_SATFINITE; + + std::vector in; + in.reserve(size); + for (size_t i = 0; i < size; i++) { + in.push_back(i + 1.1f); + } + + std::cout << "Converting float to fp8 and back..." << std::endl; + // CPU convert + std::vector cpu_out; + cpu_out.reserve(size); + for (const auto &fval : in) { + auto fp8 = convert_float_to_fp8(fval, interpret, sat); + cpu_out.push_back(convert_fp8_to_float(fp8, interpret)); + } + + // GPU convert + float *d_in, *d_out; + hip_check(hipMalloc(&d_in, sizeof(float) * size)); + hip_check(hipMalloc(&d_out, sizeof(float) * size)); + + hip_check(hipMemcpy(d_in, in.data(), sizeof(float) * in.size(), + hipMemcpyHostToDevice)); + + float_to_fp8_to_float<<<1, size>>>(d_in, interpret, sat, d_out, size); + + std::vector gpu_out(size, 0.0f); + hip_check(hipMemcpy(gpu_out.data(), d_out, sizeof(float) * gpu_out.size(), + hipMemcpyDeviceToHost)); + + hip_check(hipFree(d_in)); + hip_check(hipFree(d_out)); + + // Validation + for (size_t i = 0; i < size; i++) { + if (cpu_out[i] != gpu_out[i]) { + std::cerr << "cpu round trip result: " << cpu_out[i] + << " - gpu round trip result: " << gpu_out[i] << std::endl; + std::abort(); + } + } + std::cout << "...CPU and GPU round trip convert matches." << std::endl; + } + +There are C++ style classes available as well. + +.. code-block:: cpp + + __hip_fp8_e4m3_fnuz fp8_val(1.1f); // gfx94x + __hip_fp8_e4m3 fp8_val(1.1f); + +Each type of FP8 number has its own class: + +- __hip_fp8_e4m3 +- __hip_fp8_e5m2 +- __hip_fp8_e4m3_fnuz +- __hip_fp8_e5m2_fnuz + +There is support of vector of FP8 types. + +- __hip_fp8x2_e4m3: holds 2 values of OCP FP8 e4m3 numbers +- __hip_fp8x4_e4m3: holds 4 values of OCP FP8 e4m3 numbers +- __hip_fp8x2_e5m2: holds 2 values of OCP FP8 e5m2 numbers +- __hip_fp8x4_e5m2: holds 4 values of OCP FP8 e5m2 numbers +- __hip_fp8x2_e4m3_fnuz: holds 2 values of FP8 fnuz e4m3 numbers +- __hip_fp8x4_e4m3_fnuz: holds 4 values of FP8 fnuz e4m3 numbers +- __hip_fp8x2_e5m2_fnuz: holds 2 values of FP8 fnuz e5m2 numbers +- __hip_fp8x4_e5m2_fnuz: holds 4 values of FP8 fnuz e5m2 numbers + +FNUZ extensions will be available on gfx94x only. + +FP16 (Half Precision) +===================== + +FP16 (Floating Point 16-bit) numbers offer a balance between precision and +efficiency, making them a widely adopted standard for accelerating deep learning +inference. With higher precision than FP8 but lower memory requirements than FP32, +FP16 enables faster computations while preserving model accuracy. + +Deep learning workloads often involve massive datasets and complex calculations, +making FP32 computationally expensive. FP16 helps mitigate these costs by reducing +storage and bandwidth demands, allowing for increased throughput without significant +loss of numerical stability. This format is particularly useful for training and +inference in GPUs and TPUs optimized for half-precision arithmetic. + +There are two primary FP16 formats: + +- **float16 Format** + + - Sign: 1 bit + - Exponent: 5 bits + - Mantissa: 10 bits + +- **bfloat16 Format** + + - Sign: 1 bit + - Exponent: 8 bits + - Mantissa: 7 bits + +The float16 format offers higher precision with a narrower range, while the bfloat16 +format provides a wider range at the cost of some precision. 
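+
+The trade-off follows directly from these bit layouts (assuming the standard IEEE-754 binary16 and
+bfloat16 encodings): the spacing between 1.0 and the next representable value (the machine epsilon)
+is set by the mantissa width, while the largest finite value is set by the exponent width:
+
+.. math::
+
+   \epsilon_{\text{float16}} = 2^{-10} \approx 9.8 \times 10^{-4}, \qquad
+   \epsilon_{\text{bfloat16}} = 2^{-7} \approx 7.8 \times 10^{-3}
+
+   \text{max}_{\text{float16}} = 65504, \qquad
+   \text{max}_{\text{bfloat16}} \approx 3.4 \times 10^{38}
+
+In practice, bfloat16 covers roughly the same dynamic range as FP32 and is therefore less prone to
+overflow, while float16 resolves smaller differences between nearby values.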
+ +Additionally, FP16 numbers have standardized representations developed by industry +initiatives to ensure compatibility across various hardware and software implementations. +Unlike FP8, which has specific representations like OCP and FNUZ, FP16 is more uniformly +supported with its two main formats, float16 and bfloat16. + +HIP Header +---------- + +The `HIP FP16 header `_ +defines the float16 format. + +The `HIP BF16 header `_ +defines the bfloat16 format. + +Supported Devices +----------------- + +Different GPU models support different FP16 formats. Here's a breakdown: + +.. list-table:: Supported devices for fp16 numbers + :header-rows: 1 + + * - Device Type + - float16 + - bfloat16 + * - Host + - Yes + - Yes + * - CDNA1 + - Yes + - Yes + * - CDNA2 + - Yes + - Yes + * - CDNA3 + - Yes + - Yes + * - RDNA2 + - Yes + - Yes + * - RDNA3 + - Yes + - Yes + +Using FP16 Numbers in HIP Programs +---------------------------------- + +To use the FP16 numbers inside HIP programs. + +.. code-block:: cpp + + #include // for float16 + #include // for bfloat16 + +The following code example adds two float16 values on the GPU and compares the results +against summed float values on the CPU. + +.. code-block:: cpp + + #include + #include + #include + #include + + #define hip_check(hip_call) \ + { \ + auto hip_res = hip_call; \ + if (hip_res != hipSuccess) { \ + std::cerr << "Failed in HIP call: " << #hip_call \ + << " at " << __FILE__ << ":" << __LINE__ \ + << " with error: " << hipGetErrorString(hip_res) << std::endl; \ + std::abort(); \ + } \ + } + + __global__ void add_half_precision(__half* in1, __half* in2, float* out, size_t size) { + int idx = threadIdx.x; + if (idx < size) { + // Load as half, perform addition in float, store as float + float sum = __half2float(in1[idx] + in2[idx]); + out[idx] = sum; + } + } + + int main() { + constexpr size_t size = 32; + constexpr float tolerance = 1e-1f; // Allowable numerical difference + + // Initialize input vectors as floats + std::vector in1(size), in2(size); + for (size_t i = 0; i < size; i++) { + in1[i] = i + 1.1f; + in2[i] = i + 2.2f; + } + + // Compute expected results in full precision on CPU + std::vector cpu_out(size); + for (size_t i = 0; i < size; i++) { + cpu_out[i] = in1[i] + in2[i]; // Direct float addition + } + + // Allocate device memory (store input as half, output as float) + __half *d_in1, *d_in2; + float *d_out; + hip_check(hipMalloc(&d_in1, sizeof(__half) * size)); + hip_check(hipMalloc(&d_in2, sizeof(__half) * size)); + hip_check(hipMalloc(&d_out, sizeof(float) * size)); + + // Convert input to half and copy to device + std::vector<__half> in1_half(size), in2_half(size); + for (size_t i = 0; i < size; i++) { + in1_half[i] = __float2half(in1[i]); + in2_half[i] = __float2half(in2[i]); + } + + hip_check(hipMemcpy(d_in1, in1_half.data(), sizeof(__half) * size, hipMemcpyHostToDevice)); + hip_check(hipMemcpy(d_in2, in2_half.data(), sizeof(__half) * size, hipMemcpyHostToDevice)); + + // Launch kernel + add_half_precision<<<1, size>>>(d_in1, d_in2, d_out, size); + + // Copy result back to host + std::vector gpu_out(size, 0.0f); + hip_check(hipMemcpy(gpu_out.data(), d_out, sizeof(float) * size, hipMemcpyDeviceToHost)); + + // Free device memory + hip_check(hipFree(d_in1)); + hip_check(hipFree(d_in2)); + hip_check(hipFree(d_out)); + + // Validation with tolerance + for (size_t i = 0; i < size; i++) { + if (std::fabs(cpu_out[i] - gpu_out[i]) > tolerance) { + std::cerr << "Mismatch at index " << i << ": CPU result = " << cpu_out[i] + << ", GPU 
result = " << gpu_out[i] << std::endl; + std::abort(); + } + } + + std::cout << "Success: CPU and GPU half-precision addition match within tolerance!" << std::endl; + } + + +There are C++ style classes available as well. + +.. code-block:: cpp + + __half fp16_val(1.1f); // float16 + __hip_bfloat16 fp16_val(1.1f); // bfloat16 + +Each type of FP16 number has its own class: + +- __half +- __hip_bfloat16 + +There is support of vector of FP16 types. + +- __half2: holds 2 values of float16 numbers +- __hip_bfloat162: holds 2 values of bfloat16 numbers diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 34050b2448..2f08ffcd5a 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -113,8 +113,8 @@ subtrees: - file: reference/api_syntax - file: reference/deprecated_api_list title: List of deprecated APIs - - file: reference/fp8_numbers - title: FP8 numbers in HIP + - file: reference/low_fp_types + title: Low Precision Floating Point Types - file: reference/hardware_features - caption: Tutorials From bd74a067b41b4100ef9c4d4aa4e946eaae72e16b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Mar 2025 00:44:37 +0000 Subject: [PATCH 11/32] Bump sphinxcontrib-doxylink from 1.12.4 to 1.13.0 in /docs/sphinx Bumps [sphinxcontrib-doxylink](https://github.com/sphinx-contrib/doxylink) from 1.12.4 to 1.13.0. - [Release notes](https://github.com/sphinx-contrib/doxylink/releases) - [Changelog](https://github.com/sphinx-contrib/doxylink/blob/master/CHANGELOG.md) - [Commits](https://github.com/sphinx-contrib/doxylink/compare/1.12.4...1.13.0) --- updated-dependencies: - dependency-name: sphinxcontrib-doxylink dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- docs/sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 3707dd92a3..9582abd942 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -252,7 +252,7 @@ sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-devhelp==2.0.0 # via sphinx -sphinxcontrib-doxylink==1.12.4 +sphinxcontrib-doxylink==1.13.0 # via -r requirements.in sphinxcontrib-htmlhelp==2.1.0 # via sphinx From 392ba81f8d160f356749f839d439d8cd092cd852 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 4 Mar 2025 00:17:04 +0000 Subject: [PATCH 12/32] Bump rocm-docs-core[api_reference] from 1.17.0 to 1.17.1 in /docs/sphinx Bumps [rocm-docs-core[api_reference]](https://github.com/ROCm/rocm-docs-core) from 1.17.0 to 1.17.1. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.17.0...v1.17.1) --- updated-dependencies: - dependency-name: rocm-docs-core[api_reference] dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index f7fb1c634a..1d65d55880 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core[api_reference]==1.17.0 +rocm-docs-core[api_reference]==1.17.1 sphinxcontrib.doxylink diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 9582abd942..6db9f3e428 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -211,7 +211,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.17.0 +rocm-docs-core[api-reference]==1.17.1 # via -r requirements.in rpds-py==0.22.3 # via From 4b4eb06db1077191d32051fa3d4f85b30e37c63a Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Fri, 24 Jan 2025 15:00:27 +0100 Subject: [PATCH 13/32] Docs: Update math api page --- .wordlist.txt | 3 + docs/conf.py | 2 +- docs/reference/math_api.rst | 2399 +++++++++++++++++++++-------------- 3 files changed, 1421 insertions(+), 983 deletions(-) diff --git a/.wordlist.txt b/.wordlist.txt index 009c63b73f..c35dfb045d 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -154,6 +154,7 @@ sceneries shaders SIMT sinewave +sinf SOMA SPMV structs @@ -167,6 +168,8 @@ templated toolkits transfering typedefs +ULP +ULPs unintuitive UMM unmap diff --git a/docs/conf.py b/docs/conf.py index 8261240fb0..6e4b994bfb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -57,4 +57,4 @@ "understand/glossary.md", 'how-to/debugging_env.rst', "data/env_variables_hip.rst" -] \ No newline at end of file +] diff --git a/docs/reference/math_api.rst b/docs/reference/math_api.rst index 58203de055..16c97c3637 100644 --- a/docs/reference/math_api.rst +++ b/docs/reference/math_api.rst @@ -8,1114 +8,1549 @@ HIP math API ******************************************************************************** -HIP-Clang supports a set of math operations that are callable from the device. HIP supports most of the device functions supported by NVIDIA CUDA. These are described in the following sections. +HIP-Clang provides device-callable math operations, supporting most functions available in +NVIDIA CUDA. + +This section documents: + +- Maximum error bounds for supported HIP math functions +- Currently unsupported functions + +For a comprehensive analysis of mathematical function accuracy—including detailed evaluations +in single, double, and quadruple precision and a discussion of the IEEE 754 standard's recommendations +on correct rounding — see the paper +`Accuracy of Mathematical Functions `_. + +Error bounds on this page are measured in units in the last place (ULPs), representing the absolute +difference between a HIP math function result and its corresponding C++ standard library function +(e.g., comparing HIP's sinf with C++'s sinf). + +The following C++ example shows a simplified method for computing ULP differences between +HIP and standard C++ math functions by first finding where the maximum absolute error +occurs. + +.. 
code-block:: cpp
+
+   #include <hip/hip_runtime.h>
+   #include <cmath>
+   #include <cstdint>
+   #include <cstdlib>
+   #include <iostream>
+   #include <limits>
+   #include <vector>
+
+   #define HIP_CHECK(expression)                    \
+   {                                                \
+       const hipError_t err = expression;           \
+       if (err != hipSuccess) {                     \
+           std::cerr << "HIP error: "               \
+                     << hipGetErrorString(err)      \
+                     << " at " << __LINE__ << "\n"; \
+           exit(EXIT_FAILURE);                      \
+       }                                            \
+   }
+
+   // Simple ULP difference calculator
+   int64_t ulp_diff(float a, float b) {
+       if (a == b) return 0;
+       union { float f; int32_t i; } ua{a}, ub{b};
+
+       // For negative values, convert to a positive-based representation
+       // (use 64-bit arithmetic so the shift cannot overflow)
+       int64_t ia = ua.i, ib = ub.i;
+       if (ia < 0) ia = std::numeric_limits<int32_t>::max() - ia;
+       if (ib < 0) ib = std::numeric_limits<int32_t>::max() - ib;
+
+       return std::abs(ia - ib);
+   }
+
+   // Test kernel
+   __global__ void test_sin(float* out, int n) {
+       int i = blockIdx.x * blockDim.x + threadIdx.x;
+       if (i < n) {
+           float x = -M_PI + (2.0f * M_PI * i) / (n - 1);
+           out[i] = sin(x);
+       }
+   }
+
+   int main() {
+       const int n = 1000000;
+       const int blocksize = 256;
+       std::vector<float> outputs(n);
+       float* d_out;
+
+       HIP_CHECK(hipMalloc(&d_out, n * sizeof(float)));
+       dim3 threads(blocksize);
+       dim3 blocks((n + blocksize - 1) / blocksize); // Fixed grid calculation
+       test_sin<<<blocks, threads>>>(d_out, n);
+       HIP_CHECK(hipPeekAtLastError());
+       HIP_CHECK(hipMemcpy(outputs.data(), d_out, n * sizeof(float), hipMemcpyDeviceToHost));
+
+       // Step 1: Find the maximum absolute error
+       double max_abs_error = 0.0;
+       float max_error_output = 0.0;
+       float max_error_expected = 0.0;
+
+       for (int i = 0; i < n; i++) {
+           float x = -M_PI + (2.0f * M_PI * i) / (n - 1);
+           float expected = std::sin(x);
+           double abs_error = std::abs(outputs[i] - expected);
+
+           if (abs_error > max_abs_error) {
+               max_abs_error = abs_error;
+               max_error_output = outputs[i];
+               max_error_expected = expected;
+           }
+       }
+
+       // Step 2: Compute ULP difference based on the max absolute error pair
+       int64_t max_ulp = ulp_diff(max_error_output, max_error_expected);
+
+       // Output results
+       std::cout << "Max Absolute Error: " << max_abs_error << std::endl;
+       std::cout << "Max ULP Difference: " << max_ulp << std::endl;
+       std::cout << "Max Error Values -> Got: " << max_error_output
+                 << ", Expected: " << max_error_expected << std::endl;
+
+       HIP_CHECK(hipFree(d_out));
+       return 0;
+   }
+
+Standard mathematical functions
+===============================
+
+The functions in this section prioritize numerical accuracy and correctness, making them well-suited for
+applications that require high precision and predictable results. Unless explicitly specified, all
+math functions listed below are available on the device side.
+
+Arithmetic
+----------
+.. tab-set::
+
+   .. tab-item:: Single Precision Floating-point
+
+      .. list-table::
+         :widths: 50,20,30
+
+         * - **Function**
+           - **Test Range**
+           - **ULP Difference of Maximum Absolute Error**
+
+         * - | ``float abs(float x)``
+             | Returns the absolute value of :math:`x`
+           - :math:`x \in [-20, 20]`
+           - 0
+
+         * - | ``float fabsf(float x)``
+             | Returns the absolute value of `x`
+           - :math:`x \in [-20, 20]`
+           - 0
+
+         * - | ``float fdimf(float x, float y)``
+             | Returns the positive difference between :math:`x` and :math:`y`.
+           - | :math:`x \in [-10, 10]`
+             | :math:`y \in [-3, 3]`
+           - 0
+
+         * - | ``float fmaf(float x, float y, float z)``
+             | Returns :math:`x \cdot y + z` as a single operation.
+           - | :math:`x \in [-100, 100]`
+             | :math:`y \in [-10, 10]`
+             | :math:`z \in [-10, 10]`
+           - 0
+
+         * - | ``float fmaxf(float x, float y)``
+             | Determine the maximum numeric value of :math:`x` and :math:`y`.
+ - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``float fminf(float x, float y)`` + | Determine the minimum numeric value of :math:`x` and :math:`y`. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``float fmodf(float x, float y)`` + | Returns the floating-point remainder of :math:`x / y`. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``float modff(float x, float* iptr)`` + | Break down :math:`x` into fractional and integral parts. + - :math:`x \in [-10, 10]` + - 0 + + * - | ``float remainderf(float x, float y)`` + | Returns single-precision floating-point remainder. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``float remquof(float x, float y, int* quo)`` + | Returns single-precision floating-point remainder and part of quotient. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``float fdividef(float x, float y)`` + | Divide two floating point values. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-100, 100]` + - 0 + + + .. tab-item:: Double Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``double abs(double x)`` + | Returns the absolute value of :math:`x` + - :math:`x \in [-20, 20]` + - 0 + + * - | ``double fabs(double x)`` + | Returns the absolute value of `x` + - :math:`x \in [-20, 20]` + - 0 + + * - | ``double fdim(double x, double y)`` + | Returns the positive difference between :math:`x` and :math:`y`. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``double fma(double x, double y, double z)`` + | Returns :math:`x \cdot y + z` as a single operation. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-10, 10]` + | :math:`z \in [-10, 10]` + - 0 + + * - | ``double fmax(double x, double y)`` + | Determine the maximum numeric value of :math:`x` and :math:`y`. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``double fmin(double x, double y)`` + | Determine the minimum numeric value of :math:`x` and :math:`y`. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``double fmod(double x, double y)`` + | Returns the floating-point remainder of :math:`x / y`. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``double modf(double x, double* iptr)`` + | Break down :math:`x` into fractional and integral parts. + - :math:`x \in [-10, 10]` + - 0 + + * - | ``double remainder(double x, double y)`` + | Returns double-precision floating-point remainder. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``double remquo(double x, double y, int* quo)`` + | Returns double-precision floating-point remainder and part of quotient. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + +Classification +-------------- +.. tab-set:: + + .. tab-item:: Single Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``bool isfinite(float x)`` + | Determine whether :math:`x` is finite. + - | :math:`x \in [-\text{FLT_MAX}, \text{FLT_MAX}]` + | Special values: :math:`\pm\infty`, NaN + - 0 + + * - | ``bool isinf(float x)`` + | Determine whether :math:`x` is infinite. + - | :math:`x \in [-\text{FLT_MAX}, \text{FLT_MAX}]` + | Special values: :math:`\pm\infty`, NaN + - 0 + + * - | ``bool isnan(float x)`` + | Determine whether :math:`x` is a ``NAN``. 
+ - | :math:`x \in [-\text{FLT_MAX}, \text{FLT_MAX}]` + | Special values: :math:`\pm\infty`, NaN + - 0 + + * - | ``bool signbit(float x)`` + | Return the sign bit of :math:`x`. + - | :math:`x \in [-\text{FLT_MAX}, \text{FLT_MAX}]` + | Special values: :math:`\pm\infty`, :math:`\pm0`, NaN + - 0 + + * - | ``float nanf(const char* tagp)`` + | Returns "Not a Number" value. + - | Input strings: ``""``, ``"1"``, ``"2"``, + | ``"quiet"``, ``"signaling"``, ``"ind"`` + - 0 + + .. tab-item:: Double Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``bool isfinite(double x)`` + | Determine whether :math:`x` is finite. + - | :math:`x \in [-\text{DBL_MAX}, \text{DBL_MAX}]` + | Special values: :math:`\pm\infty`, NaN + - 0 + + * - | ``bool isin(double x)`` + | Determine whether :math:`x` is infinite. + - | :math:`x \in [-\text{DBL_MAX}, \text{DBL_MAX}]` + | Special values: :math:`\pm\infty`, NaN + - 0 + + * - | ``bool isnan(double x)`` + | Determine whether :math:`x` is a ``NAN``. + - | :math:`x \in [-\text{DBL_MAX}, \text{DBL_MAX}]` + | Special values: :math:`\pm\infty`, NaN + - 0 + + * - | ``bool signbit(double x)`` + | Return the sign bit of :math:`x`. + - | :math:`x \in [-\text{DBL_MAX}, \text{DBL_MAX}]` + | Special values: :math:`\pm\infty`, :math:`\pm0`, NaN + - 0 + + * - | ``double nan(const char* tagp)`` + | Returns "Not a Number" value. + - | Input strings: ``""``, ``"1"``, ``"2"``, + | ``"quiet"``, ``"signaling"``, ``"ind"`` + - 0 + +Error and Gamma +--------------- +.. tab-set:: + + .. tab-item:: Single Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``float erff(float x)`` + | Returns the error function of :math:`x`. + - :math:`x \in [-4, 4]` + - 4 + + * - | ``float erfcf(float x)`` + | Returns the complementary error function of :math:`x`. + - :math:`x \in [-4, 4]` + - 2 + + * - | ``float erfcxf(float x)`` + | Returns the scaled complementary error function of :math:`x`. + - :math:`x \in [-2, 2]` + - 5 + + * - | ``float lgammaf(float x)`` + | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. + - :math:`x \in [0.5, 20]` + - 4 + + * - | ``float tgammaf(float x)`` + | Returns the gamma function of :math:`x`. + - :math:`x \in [0.5, 15]` + - 6 + + .. tab-item:: Double Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``double erf(double x)`` + | Returns the error function of :math:`x`. + - :math:`x \in [-4, 4]` + - 4 + + * - | ``double erfc(double x)`` + | Returns the complementary error function of :math:`x`. + - :math:`x \in [-4, 4]` + - 2 + + * - | ``double erfcx(double x)`` + | Returns the scaled complementary error function of :math:`x`. + - :math:`x \in [-2, 2]` + - 5 + + * - | ``double lgamma(double x)`` + | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. + - :math:`x \in [0.5, 20]` + - 2 + + * - | ``double tgamma(double x)`` + | Returns the gamma function of :math:`x`. + - :math:`x \in [0.5, 15]` + - 6 + +Exponential and Logarithmic +--------------------------- +.. tab-set:: + + .. tab-item:: Single Precision Floating-point + + .. 
list-table::
+         :widths: 50,20,30
+
+         * - **Function**
+           - **Test Range**
+           - **ULP Difference of Maximum Absolute Error**
+
+         * - | ``float expf(float x)``
+             | Returns :math:`e^x`.
+           - :math:`x \in [-10, 10]`
+           - 1
+
+         * - | ``float exp2f(float x)``
+             | Returns :math:`2^x`.
+           - :math:`x \in [-10, 10]`
+           - 1
+
+         * - | ``float exp10f(float x)``
+             | Returns :math:`10^x`.
+           - :math:`x \in [-4, 4]`
+           - 1
+
+         * - | ``float expm1f(float x)``
+             | Returns :math:`e^x - 1`.
+           - :math:`x \in [-10, 10]`
+           - 1
+
+         * - | ``float log10f(float x)``
+             | Returns the base 10 logarithm of :math:`x`.
+           - :math:`x \in [10^{-6}, 10^6]`
+           - 2
+
+         * - | ``float log1pf(float x)``
+             | Returns the natural logarithm of :math:`x + 1`.
+           - :math:`x \in [-0.9, 10]`
+           - 1
+
+         * - | ``float log2f(float x)``
+             | Returns the base 2 logarithm of :math:`x`.
+           - :math:`x \in [10^{-6}, 10^6]`
+           - 1
+
+         * - | ``float logf(float x)``
+             | Returns the natural logarithm of :math:`x`.
+           - :math:`x \in [10^{-6}, 10^6]`
+           - 2
+
+   .. tab-item:: Double Precision Floating-point
+
+      .. list-table::
+         :widths: 50,20,30
+
+         * - **Function**
+           - **Test Range**
+           - **ULP Difference of Maximum Absolute Error**
+
+         * - | ``double exp(double x)``
+             | Returns :math:`e^x`.
+           - :math:`x \in [-10, 10]`
+           - 1
+
+         * - | ``double exp2(double x)``
+             | Returns :math:`2^x`.
+           - :math:`x \in [-10, 10]`
+           - 1
+
+         * - | ``double exp10(double x)``
+             | Returns :math:`10^x`.
+           - :math:`x \in [-4, 4]`
+           - 1
+
+         * - | ``double expm1(double x)``
+             | Returns :math:`e^x - 1`.
+           - :math:`x \in [-10, 10]`
+           - 1
+
+         * - | ``double log10(double x)``
+             | Returns the base 10 logarithm of :math:`x`.
+           - :math:`x \in [10^{-6}, 10^6]`
+           - 1
+
+         * - | ``double log1p(double x)``
+             | Returns the natural logarithm of :math:`x + 1`.
+           - :math:`x \in [-0.9, 10]`
+           - 1
+
+         * - | ``double log2(double x)``
+             | Returns the base 2 logarithm of :math:`x`.
+           - :math:`x \in [10^{-6}, 10^6]`
+           - 1
+
+         * - | ``double log(double x)``
+             | Returns the natural logarithm of :math:`x`.
+           - :math:`x \in [10^{-6}, 10^6]`
+           - 1
+
+Floating Point Manipulation
+---------------------------
+.. tab-set::
+
+   .. tab-item:: Single Precision Floating-point
+
+      .. list-table::
+         :widths: 50,20,30
+
+         * - **Function**
+           - **Test Range**
+           - **ULP Difference of Maximum Absolute Error**
+
+         * - | ``float copysignf(float x, float y)``
+             | Create value with given magnitude, copying sign of second value.
+           - | :math:`x \in [-10, 10]`
+             | :math:`y \in [-3, 3]`
+           - 0
+
+         * - | ``float frexpf(float x, int* nptr)``
+             | Extract mantissa and exponent of :math:`x`.
+           - :math:`x \in [-10, 10]`
+           - 0
+
+         * - | ``int ilogbf(float x)``
+             | Returns the unbiased integer exponent of :math:`x`.
+           - :math:`x \in [0.01, 100]`
+           - 0
+
+         * - | ``float logbf(float x)``
+             | Returns the floating point representation of the exponent of :math:`x`.
+           - :math:`x \in [10^{-6}, 10^6]`
+           - 0
+
+         * - | ``float ldexpf(float x, int exp)``
+             | Returns :math:`x \cdot 2^{exp}`.
+           - | :math:`x \in [-10, 10]`
+             | :math:`\text{exp} \in [-4, 4]`
+           - 0
+
+         * - | ``float nextafterf(float x, float y)``
+             | Returns next representable single-precision floating-point value after argument.
+           - | :math:`x \in [-10, 10]`
+             | :math:`y \in [-3, 3]`
+           - 0
+
+         * - | ``float scalblnf(float x, long int n)``
+             | Scale :math:`x` by :math:`2^n`.
+           - | :math:`x \in [-10, 10]`
+             | :math:`n \in [-4, 4]`
+           - 0
+
+         * - | ``float scalbnf(float x, int n)``
+             | Scale :math:`x` by :math:`2^n`.
+ - | :math:`x \in [-10, 10]` + | :math:`n \in [-4, 4]` + - 0 + + .. tab-item:: Double Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``double copysign(double x, double y)`` + | Create value with given magnitude, copying sign of second value. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``double frexp(double x, int* nptr)`` + | Extract mantissa and exponent of :math:`x`. + - :math:`x \in [-10, 10]` + - 0 + + * - | ``int ilogb(double x)`` + | Returns the unbiased integer exponent of :math:`x`. + - :math:`x \in [0.01, 100]` + - 0 + + * - | ``double logb(double x)`` + | Returns the floating point representation of the exponent of :math:`x`. + - :math:`x \in [10^{-6}, 10^6]` + - 0 + + * - | ``double ldexp(double x, int exp)`` + | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. + - | :math:`x \in [-10, 10]` + | :math:`\text{exp} \in [-4, 4]` + - 0 + + * - | ``double nextafter(double x, double y)`` + | Returns next representable double-precision floating-point value after argument. + - | :math:`x \in [-10, 10]` + | :math:`y \in [-3, 3]` + - 0 + + * - | ``double scalbln(double x, long int n)`` + | Scale :math:`x` by :math:`2^n`. + - | :math:`x \in [-10, 10]` + | :math:`n \in [-4, 4]` + - 0 + + * - | ``double scalbn(double x, int n)`` + | Scale :math:`x` by :math:`2^n`. + - | :math:`x \in [-10, 10]` + | :math:`n \in [-4, 4]` + - 0 + +Hypotenuse and Norm +------------------- +.. tab-set:: + + .. tab-item:: Single Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``float hypotf(float x, float y)`` + | Returns the square root of the sum of squares of :math:`x` and :math:`y`. + - | :math:`x \in [-10, 10]` + | :math:`y \in [0, 10]` + - 1 + + * - | ``float rhypotf(float x, float y)`` + | Returns one over the square root of the sum of squares of two arguments. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-10, 100]` + - 1 + + * - | ``float norm3df(float x, float y, float z)`` + | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`. + - | All inputs in + | :math:`[-10, 10]` + - 1 + + * - | ``float norm4df(float x, float y, float z, float w)`` + | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`. + - | All inputs in + | :math:`[-10, 10]` + - 2 + + * - | ``float rnorm3df(float x, float y, float z)`` + | Returns one over the square root of the sum of squares of three coordinates of the argument. + - | All inputs in + | :math:`[-10, 10]` + - 1 + + * - | ``float rnorm4df(float x, float y, float z, float w)`` + | Returns one over the square root of the sum of squares of four coordinates of the argument. + - | All inputs in + | :math:`[-10, 10]` + - 2 + + * - | ``float normf(int dim, const float *a)`` + | Returns the square root of the sum of squares of any number of coordinates. + - | :math:`\text{dim} \in [2,4]` + | :math:`a[i] \in [-10, 10]` + - | Error depends on the number of coordinates + | e.g. ``dim = 2`` -> 1 + | e.g. ``dim = 3`` -> 1 + | e.g. ``dim = 4`` -> 1 + + * - | ``float rnormf(int dim, const float *a)`` + | Returns the reciprocal of square root of the sum of squares of any number of coordinates. + - | :math:`\text{dim} \in [2,4]` + | :math:`a[i] \in [-10, 10]` + - | Error depends on the number of coordinates + | e.g. 
``dim = 2`` -> 1 + | e.g. ``dim = 3`` -> 1 + | e.g. ``dim = 4`` -> 1 + + .. tab-item:: Double Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``double hypot(double x, double y)`` + | Returns the square root of the sum of squares of :math:`x` and :math:`y`. + - | :math:`x \in [-10, 10]` + | :math:`y \in [0, 10]` + - 1 + + * - | ``double rhypot(double x, double y)`` + | Returns one over the square root of the sum of squares of two arguments. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-10, 100]` + - 1 + + * - | ``double norm3d(double x, double y, double z)`` + | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`. + - | All inputs in + | :math:`[-10, 10]` + - 1 + + * - | ``double norm4d(double x, double y, double z, double w)`` + | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`. + - | All inputs in + | :math:`[-10, 10]` + - 2 + + * - | ``double rnorm3d(double x, double y, double z)`` + | Returns one over the square root of the sum of squares of three coordinates of the argument. + - | All inputs in + | :math:`[-10, 10]` + - 1 + + * - | ``double rnorm4d(double x, double y, double z, double w)`` + | Returns one over the square root of the sum of squares of four coordinates of the argument. + - | All inputs in + | :math:`[-10, 10]` + - 1 + + * - | ``double norm(int dim, const double *a)`` + | Returns the square root of the sum of squares of any number of coordinates. + - | :math:`\text{dim} \in [2,4]` + | :math:`a[i] \in [-10, 10]` + - | Error depends on the number of coordinates + | e.g. ``dim = 2`` -> 1 + | e.g. ``dim = 3`` -> 1 + | e.g. ``dim = 4`` -> 1 + + * - | ``double rnorm(int dim, const double *a)`` + | Returns the reciprocal of square root of the sum of squares of any number of coordinates. + - | :math:`\text{dim} \in [2,4]` + | :math:`a[i] \in [-10, 10]` + - | Error depends on the number of coordinates + | e.g. ``dim = 2`` -> 1 + | e.g. ``dim = 3`` -> 1 + | e.g. ``dim = 4`` -> 1 + + +Power and Root +-------------- +.. tab-set:: + + .. tab-item:: Single Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``float cbrtf(float x)`` + | Returns the cube root of :math:`x`. + - :math:`x \in [-100, 100]` + - 2 + + * - | ``float powf(float x, float y)`` + | Returns :math:`x^y`. + - | :math:`x \in [-4, 4]` + | :math:`y \in [-2, 2]` + - 1 + + * - | ``float powif(float base, int iexp)`` + | Returns the value of first argument to the power of second argument. + - | :math:`\text{base} \in [-10, 10]` + | :math:`\text{iexp} \in [-4, 4]` + - 1 + + * - | ``float sqrtf(float x)`` + | Returns the square root of :math:`x`. + - :math:`x \in [0, 100]` + - 1 + + * - | ``float rsqrtf(float x)`` + | Returns the reciprocal of the square root of :math:`x`. + - :math:`x \in [0.01, 100]` + - 1 + + * - | ``float rcbrtf(float x)`` + | Returns the reciprocal cube root function. + - :math:`x \in [-100, 100]` + - 1 + + .. tab-item:: Double Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``double cbrt(double x)`` + | Returns the cube root of :math:`x`. + - :math:`x \in [-100, 100]` + - 1 + + * - | ``double pow(double x, double y)`` + | Returns :math:`x^y`. 
+ - | :math:`x \in [-4, 4]` + | :math:`y \in [-2, 2]` + - 1 + + * - | ``double powi(double base, int iexp)`` + | Returns the value of first argument to the power of second argument. + - | :math:`\text{base} \in [-10, 10]` + | :math:`\text{iexp} \in [-4, 4]` + - 1 + + * - | ``double sqrt(double x)`` + | Returns the square root of :math:`x`. + - :math:`x \in [0, 100]` + - 1 + + * - | ``double rsqrt(double x)`` + | Returns the reciprocal of the square root of :math:`x`. + - :math:`x \in [0.01, 100]` + - 1 + + * - | ``double rcbrt(double x)`` + | Returns the reciprocal cube root function. + - :math:`x \in [-100, 100]` + - 1 + +Rounding +-------- +.. tab-set:: + + .. tab-item:: Single Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``float ceilf(float x)`` + | Returns ceiling of :math:`x`. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``float floorf(float x)`` + | Returns the largest integer less than or equal to :math:`x`. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``long int lroundf(float x)`` + | Round to nearest integer value. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``long long int llroundf(float x)`` + | Round to nearest integer value. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``long int lrintf(float x)`` + | Round :math:`x` to nearest integer value. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``long long int llrintf(float x)`` + | Round :math:`x` to nearest integer value. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``float nearbyintf(float x)`` + | Round :math:`x` to the nearest integer. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``float roundf(float x)`` + | Round to nearest integer value in floating-point. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``float rintf(float x)`` + | Round input to nearest integer value in floating-point. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``float truncf(float x)`` + | Truncate :math:`x` to the integral part. + - :math:`x \in [-4, 4]` + - 0 + + .. tab-item:: Double Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``double ceil(double x)`` + | Returns ceiling of :math:`x`. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``double floor(double x)`` + | Returns the largest integer less than or equal to :math:`x`. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``long int lround(double x)`` + | Round to nearest integer value. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``long long int llround(double x)`` + | Round to nearest integer value. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``long int lrint(double x)`` + | Round :math:`x` to nearest integer value. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``long long int llrint(double x)`` + | Round :math:`x` to nearest integer value. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``double nearbyint(double x)`` + | Round :math:`x` to the nearest integer. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``double round(double x)`` + | Round to nearest integer value in floating-point. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``double rint(double x)`` + | Round input to nearest integer value in floating-point. + - :math:`x \in [-4, 4]` + - 0 + + * - | ``double trunc(double x)`` + | Truncate :math:`x` to the integral part. + - :math:`x \in [-4, 4]` + - 0 + +Trigonometric and Hyperbolic +---------------------------- +.. tab-set:: + + .. tab-item:: Single Precision Floating-point + + .. 
list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``float acosf(float x)`` + | Returns the arc cosine of :math:`x`. + - :math:`x \in [-1, 1]` + - 1 + + * - | ``float acoshf(float x)`` + | Returns the nonnegative arc hyperbolic cosine of :math:`x`. + - :math:`x \in [1, 100]` + - 1 + + * - | ``float asinf(float x)`` + | Returns the arc sine of :math:`x`. + - :math:`x \in [-1, 1]` + - 2 + + * - | ``float asinhf(float x)`` + | Returns the arc hyperbolic sine of :math:`x`. + - :math:`x \in [-10, 10]` + - 1 + + * - | ``float atanf(float x)`` + | Returns the arc tangent of :math:`x`. + - :math:`x \in [-10, 10]` + - 2 + + * - | ``float atan2f(float x, float y)`` + | Returns the arc tangent of the ratio of :math:`x` and :math:`y`. + - | :math:`x \in [-4, 4]` + | :math:`y \in [-2, 2]` + - 1 + + * - | ``float atanhf(float x)`` + | Returns the arc hyperbolic tangent of :math:`x`. + - :math:`x \in [-0.9, 0.9]` + - 1 + + * - | ``float cosf(float x)`` + | Returns the cosine of :math:`x`. + - :math:`x \in [-\pi, \pi]` + - 1 + + * - | ``float coshf(float x)`` + | Returns the hyperbolic cosine of :math:`x`. + - :math:`x \in [-5, 5]` + - 1 + + * - | ``float sinf(float x)`` + | Returns the sine of :math:`x`. + - :math:`x \in [-\pi, \pi]` + - 1 + + * - | ``float sinhf(float x)`` + | Returns the hyperbolic sine of :math:`x`. + - :math:`x \in [-5, 5]` + - 1 + + * - | ``void sincosf(float x, float *sptr, float *cptr)`` + | Returns the sine and cosine of :math:`x`. + - :math:`x \in [-3, 3]` + - | ``sin``: 1 + | ``cos``: 1 + + * - | ``float tanf(float x)`` + | Returns the tangent of :math:`x`. + - :math:`x \in [-1.47\pi, 1.47\pi]` + - 1 + + * - | ``float tanhf(float x)`` + | Returns the hyperbolic tangent of :math:`x`. + - :math:`x \in [-5, 5]` + - 2 + + * - | ``float cospif(float x)`` + | Returns the cosine of :math:`\pi \cdot x`. + - :math:`x \in [-0.3, 0.3]` + - 1 + + * - | ``float sinpif(float x)`` + | Returns the hyperbolic sine of :math:`\pi \cdot x`. + - :math:`x \in [-0.625, 0.625]` + - 2 + + * - | ``void sincospif(float x, float *sptr, float *cptr)`` + | Returns the sine and cosine of :math:`\pi \cdot x`. + - :math:`x \in [-0.3, 0.3]` + - | ``sinpi``: 2 + | ``cospi``: 1 + + .. tab-item:: Double Precision Floating-point + + .. list-table:: + :widths: 50,20,30 + + * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** + + * - | ``double acos(double x)`` + | Returns the arc cosine of :math:`x`. + - :math:`x \in [-1, 1]` + - 1 + + * - | ``double acosh(double x)`` + | Returns the nonnegative arc hyperbolic cosine of :math:`x`. + - :math:`x \in [1, 100]` + - 1 + + * - | ``double asin(double x)`` + | Returns the arc sine of :math:`x`. + - :math:`x \in [-1, 1]` + - 1 + + * - | ``double asinh(double x)`` + | Returns the arc hyperbolic sine of :math:`x`. + - :math:`x \in [-10, 10]` + - 1 + + * - | ``double atan(double x)`` + | Returns the arc tangent of :math:`x`. + - :math:`x \in [-10, 10]` + - 1 + + * - | ``double atan2(double x, double y)`` + | Returns the arc tangent of the ratio of :math:`x` and :math:`y`. + - | :math:`x \in [-4, 4]` + | :math:`y \in [-2, 2]` + - 1 + + * - | ``double atanh(double x)`` + | Returns the arc hyperbolic tangent of :math:`x`. + - :math:`x \in [-0.9, 0.9]` + - 1 + + * - | ``double cos(double x)`` + | Returns the cosine of :math:`x`. + - :math:`x \in [-\pi, \pi]` + - 1 + + * - | ``double cosh(double x)`` + | Returns the hyperbolic cosine of :math:`x`. 
+ - :math:`x \in [-5, 5]` + - 1 -Single precision mathematical functions -======================================= + * - | ``double sin(double x)`` + | Returns the sine of :math:`x`. + - :math:`x \in [-\pi, \pi]` + - 1 + * - | ``double sinh(double x)`` + | Returns the hyperbolic sine of :math:`x`. + - :math:`x \in [-5, 5]` + - 1 -Following is the list of supported single precision mathematical functions. + * - | ``void sincos(double x, double *sptr, double *cptr)`` + | Returns the sine and cosine of :math:`x`. + - :math:`x \in [-3, 3]` + - | ``sin``: 1 + | ``cos``: 1 -.. list-table:: Single precision mathematical functions + * - | ``double tan(double x)`` + | Returns the tangent of :math:`x`. + - :math:`x \in [-1.47\pi, 1.47\pi]` + - 1 - * - **Function** - - **Supported on Host** - - **Supported on Device** - - * - | ``float abs(float x)`` - | Returns the absolute value of :math:`x` - - ✓ - - ✓ - - * - | ``float acosf(float x)`` - | Returns the arc cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``float acoshf(float x)`` - | Returns the nonnegative arc hyperbolic cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``float asinf(float x)`` - | Returns the arc sine of :math:`x`. - - ✓ - - ✓ - - * - | ``float asinhf(float x)`` - | Returns the arc hyperbolic sine of :math:`x`. - - ✓ - - ✓ - - * - | ``float atanf(float x)`` - | Returns the arc tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``float atan2f(float x, float y)`` - | Returns the arc tangent of the ratio of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``float atanhf(float x)`` - | Returns the arc hyperbolic tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``float cbrtf(float x)`` - | Returns the cube root of :math:`x`. - - ✓ - - ✓ - - * - | ``float ceilf(float x)`` - | Returns ceiling of :math:`x`. - - ✓ - - ✓ - - * - | ``float copysignf(float x, float y)`` - | Create value with given magnitude, copying sign of second value. - - ✓ - - ✓ - - * - | ``float cosf(float x)`` - | Returns the cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``float coshf(float x)`` - | Returns the hyperbolic cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``float cospif(float x)`` - | Returns the cosine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``float cyl_bessel_i0f(float x)`` - | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`. - - ✗ - - ✗ - - * - | ``float cyl_bessel_i1f(float x)`` - | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`. - - ✗ - - ✗ - - * - | ``float erff(float x)`` - | Returns the error function of :math:`x`. - - ✓ - - ✓ - - * - | ``float erfcf(float x)`` - | Returns the complementary error function of :math:`x`. - - ✓ - - ✓ - - * - | ``float erfcinvf(float x)`` - | Returns the inverse complementary function of :math:`x`. - - ✓ - - ✓ - - * - | ``float erfcxf(float x)`` - | Returns the scaled complementary error function of :math:`x`. - - ✓ - - ✓ - - * - | ``float erfinvf(float x)`` - | Returns the inverse error function of :math:`x`. - - ✓ - - ✓ - - * - | ``float expf(float x)`` - | Returns :math:`e^x`. - - ✓ - - ✓ - - * - | ``float exp10f(float x)`` - | Returns :math:`10^x`. - - ✓ - - ✓ - - * - | ``float exp2f( float x)`` - | Returns :math:`2^x`. - - ✓ - - ✓ - - * - | ``float expm1f(float x)`` - | Returns :math:`ln(x - 1)` - - ✓ - - ✓ - - * - | ``float fabsf(float x)`` - | Returns the absolute value of `x` - - ✓ - - ✓ - - * - | ``float fdimf(float x, float y)`` - | Returns the positive difference between :math:`x` and :math:`y`. 
- - ✓ - - ✓ - - * - | ``float fdividef(float x, float y)`` - | Divide two floating point values. - - ✓ - - ✓ - - * - | ``float floorf(float x)`` - | Returns the largest integer less than or equal to :math:`x`. - - ✓ - - ✓ - - * - | ``float fmaf(float x, float y, float z)`` - | Returns :math:`x \cdot y + z` as a single operation. - - ✓ - - ✓ - - * - | ``float fmaxf(float x, float y)`` - | Determine the maximum numeric value of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``float fminf(float x, float y)`` - | Determine the minimum numeric value of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``float fmodf(float x, float y)`` - | Returns the floating-point remainder of :math:`x / y`. - - ✓ - - ✓ - - * - | ``float modff(float x, float* iptr)`` - | Break down :math:`x` into fractional and integral parts. - - ✓ - - ✗ - - * - | ``float frexpf(float x, int* nptr)`` - | Extract mantissa and exponent of :math:`x`. - - ✓ - - ✗ - - * - | ``float hypotf(float x, float y)`` - | Returns the square root of the sum of squares of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``int ilogbf(float x)`` - | Returns the unbiased integer exponent of :math:`x`. - - ✓ - - ✓ - - * - | ``bool isfinite(float x)`` - | Determine whether :math:`x` is finite. - - ✓ - - ✓ - - * - | ``bool isinf(float x)`` - | Determine whether :math:`x` is infinite. - - ✓ - - ✓ - - * - | ``bool isnan(float x)`` - | Determine whether :math:`x` is a ``NAN``. - - ✓ - - ✓ - - * - | ``float j0f(float x)`` - | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`. - - ✓ - - ✓ - - * - | ``float j1f(float x)`` - | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`. - - ✓ - - ✓ - - * - | ``float jnf(int n, float x)`` - | Returns the value of the Bessel function of the first kind of order n for :math:`x`. - - ✓ - - ✓ - - * - | ``float ldexpf(float x, int exp)`` - | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. - - ✓ - - ✓ - - * - | ``float lgammaf(float x)`` - | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. - - ✓ - - ✗ - - * - | ``long int lrintf(float x)`` - | Round :math:`x` to nearest integer value. - - ✓ - - ✓ - - * - | ``long long int llrintf(float x)`` - | Round :math:`x` to nearest integer value. - - ✓ - - ✓ - - * - | ``long int lroundf(float x)`` - | Round to nearest integer value. - - ✓ - - ✓ - - * - | ``long long int llroundf(float x)`` - | Round to nearest integer value. - - ✓ - - ✓ - - * - | ``float log10f(float x)`` - | Returns the base 10 logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``float log1pf(float x)`` - | Returns the natural logarithm of :math:`x + 1`. - - ✓ - - ✓ - - * - | ``float log2f(float x)`` - | Returns the base 2 logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``float logf(float x)`` - | Returns the natural logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``float logbf(float x)`` - | Returns the floating point representation of the exponent of :math:`x`. - - ✓ - - ✓ - - * - | ``float nanf(const char* tagp)`` - | Returns "Not a Number" value. - - ✗ - - ✓ - - * - | ``float nearbyintf(float x)`` - | Round :math:`x` to the nearest integer. - - ✓ - - ✓ - - * - | ``float nextafterf(float x, float y)`` - | Returns next representable single-precision floating-point value after argument. - - ✓ - - ✗ - - * - | ``float norm3df(float x, float y, float z)`` - | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`. 
- - ✓ - - ✓ - - * - | ``float norm4df(float x, float y, float z, float w)`` - | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`. - - ✓ - - ✓ - - * - | ``float normcdff(float y)`` - | Returns the standard normal cumulative distribution function. - - ✓ - - ✓ - - * - | ``float normcdfinvf(float y)`` - | Returns the inverse of the standard normal cumulative distribution function. - - ✓ - - ✓ - - * - | ``float normf(int dim, const float *a)`` - | Returns the square root of the sum of squares of any number of coordinates. - - ✓ - - ✓ - - * - | ``float powf(float x, float y)`` - | Returns :math:`x^y`. - - ✓ - - ✓ - - * - | ``float powif(float base, int iexp)`` - | Returns the value of first argument to the power of second argument. - - ✓ - - ✓ - - * - | ``float remainderf(float x, float y)`` - | Returns single-precision floating-point remainder. - - ✓ - - ✓ - - * - | ``float remquof(float x, float y, int* quo)`` - | Returns single-precision floating-point remainder and part of quotient. - - ✓ - - ✓ - - * - | ``float roundf(float x)`` - | Round to nearest integer value in floating-point. - - ✓ - - ✓ - - * - | ``float rcbrtf(float x)`` - | Returns the reciprocal cube root function. - - ✓ - - ✓ - - * - | ``float rhypotf(float x, float y)`` - | Returns one over the square root of the sum of squares of two arguments. - - ✓ - - ✓ - - * - | ``float rintf(float x)`` - | Round input to nearest integer value in floating-point. - - ✓ - - ✓ - - * - | ``float rnorm3df(float x, float y, float z)`` - | Returns one over the square root of the sum of squares of three coordinates of the argument. - - ✓ - - ✓ - - * - | ``float rnorm4df(float x, float y, float z, float w)`` - | Returns one over the square root of the sum of squares of four coordinates of the argument. - - ✓ - - ✓ - - * - | ``float rnormf(int dim, const float *a)`` - | Returns the reciprocal of square root of the sum of squares of any number of coordinates. - - ✓ - - ✓ - - * - | ``float scalblnf(float x, long int n)`` - | Scale :math:`x` by :math:`2^n`. - - ✓ - - ✓ - - * - | ``float scalbnf(float x, int n)`` - | Scale :math:`x` by :math:`2^n`. - - ✓ - - ✓ - - * - | ``bool signbit(float x)`` - | Return the sign bit of :math:`x`. - - ✓ - - ✓ - - * - | ``float sinf(float x)`` - | Returns the sine of :math:`x`. - - ✓ - - ✓ - - * - | ``float sinhf(float x)`` - | Returns the hyperbolic sine of :math:`x`. - - ✓ - - ✓ - - * - | ``float sinpif(float x)`` - | Returns the hyperbolic sine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``void sincosf(float x, float *sptr, float *cptr)`` - | Returns the sine and cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``void sincospif(float x, float *sptr, float *cptr)`` - | Returns the sine and cosine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``float sqrtf(float x)`` - | Returns the square root of :math:`x`. - - ✓ - - ✓ - - * - | ``float rsqrtf(float x)`` - | Returns the reciprocal of the square root of :math:`x`. - - ✗ - - ✓ - - * - | ``float tanf(float x)`` - | Returns the tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``float tanhf(float x)`` - | Returns the hyperbolic tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``float tgammaf(float x)`` - | Returns the gamma function of :math:`x`. - - ✓ - - ✓ - - * - | ``float truncf(float x)`` - | Truncate :math:`x` to the integral part. - - ✓ - - ✓ - - * - | ``float y0f(float x)`` - | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`. 
- - ✓ - - ✓ - - * - | ``float y1f(float x)`` - | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`. - - ✓ - - ✓ - - * - | ``float ynf(int n, float x)`` - | Returns the value of the Bessel function of the second kind of order n for :math:`x`. - - ✓ - - ✓ - -Double precision mathematical functions -======================================= - -Following is the list of supported double precision mathematical functions. - -.. list-table:: Double precision mathematical functions + * - | ``double tanh(double x)`` + | Returns the hyperbolic tangent of :math:`x`. + - :math:`x \in [-5, 5]` + - 1 - * - **Function** - - **Supported on Host** - - **Supported on Device** - - * - | ``double abs(double x)`` - | Returns the absolute value of :math:`x` - - ✓ - - ✓ - - * - | ``double acos(double x)`` - | Returns the arc cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``double acosh(double x)`` - | Returns the nonnegative arc hyperbolic cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``double asin(double x)`` - | Returns the arc sine of :math:`x`. - - ✓ - - ✓ - - * - | ``double asinh(double x)`` - | Returns the arc hyperbolic sine of :math:`x`. - - ✓ - - ✓ - - * - | ``double atan(double x)`` - | Returns the arc tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``double atan2(double x, double y)`` - | Returns the arc tangent of the ratio of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``double atanh(double x)`` - | Returns the arc hyperbolic tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``double cbrt(double x)`` - | Returns the cube root of :math:`x`. - - ✓ - - ✓ - - * - | ``double ceil(double x)`` - | Returns ceiling of :math:`x`. - - ✓ - - ✓ - - * - | ``double copysign(double x, double y)`` - | Create value with given magnitude, copying sign of second value. - - ✓ - - ✓ - - * - | ``double cos(double x)`` - | Returns the cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``double cosh(double x)`` - | Returns the hyperbolic cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``double cospi(double x)`` - | Returns the cosine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``double cyl_bessel_i0(double x)`` - | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`. - - ✗ - - ✗ - - * - | ``double cyl_bessel_i1(double x)`` - | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`. - - ✗ - - ✗ - - * - | ``double erf(double x)`` - | Returns the error function of :math:`x`. - - ✓ - - ✓ - - * - | ``double erfc(double x)`` - | Returns the complementary error function of :math:`x`. - - ✓ - - ✓ - - * - | ``double erfcinv(double x)`` - | Returns the inverse complementary function of :math:`x`. - - ✓ - - ✓ - - * - | ``double erfcx(double x)`` - | Returns the scaled complementary error function of :math:`x`. - - ✓ - - ✓ - - * - | ``double erfinv(double x)`` - | Returns the inverse error function of :math:`x`. - - ✓ - - ✓ - - * - | ``double exp(double x)`` - | Returns :math:`e^x`. - - ✓ - - ✓ - - * - | ``double exp10(double x)`` - | Returns :math:`10^x`. - - ✓ - - ✓ - - * - | ``double exp2( double x)`` - | Returns :math:`2^x`. - - ✓ - - ✓ - - * - | ``double expm1(double x)`` - | Returns :math:`ln(x - 1)` - - ✓ - - ✓ - - * - | ``double fabs(double x)`` - | Returns the absolute value of `x` - - ✓ - - ✓ - - * - | ``double fdim(double x, double y)`` - | Returns the positive difference between :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``double floor(double x)`` - | Returns the largest integer less than or equal to :math:`x`. 
- - ✓ - - ✓ - - * - | ``double fma(double x, double y, double z)`` - | Returns :math:`x \cdot y + z` as a single operation. - - ✓ - - ✓ - - * - | ``double fmax(double x, double y)`` - | Determine the maximum numeric value of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``double fmin(double x, double y)`` - | Determine the minimum numeric value of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``double fmod(double x, double y)`` - | Returns the floating-point remainder of :math:`x / y`. - - ✓ - - ✓ - - * - | ``double modf(double x, double* iptr)`` - | Break down :math:`x` into fractional and integral parts. - - ✓ - - ✗ - - * - | ``double frexp(double x, int* nptr)`` - | Extract mantissa and exponent of :math:`x`. - - ✓ - - ✗ - - * - | ``double hypot(double x, double y)`` - | Returns the square root of the sum of squares of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``int ilogb(double x)`` - | Returns the unbiased integer exponent of :math:`x`. - - ✓ - - ✓ - - * - | ``bool isfinite(double x)`` - | Determine whether :math:`x` is finite. - - ✓ - - ✓ - - * - | ``bool isin(double x)`` - | Determine whether :math:`x` is infinite. - - ✓ - - ✓ - - * - | ``bool isnan(double x)`` - | Determine whether :math:`x` is a ``NAN``. - - ✓ - - ✓ - - * - | ``double j0(double x)`` - | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`. - - ✓ - - ✓ - - * - | ``double j1(double x)`` - | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`. - - ✓ - - ✓ - - * - | ``double jn(int n, double x)`` - | Returns the value of the Bessel function of the first kind of order n for :math:`x`. - - ✓ - - ✓ - - * - | ``double ldexp(double x, int exp)`` - | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. - - ✓ - - ✓ - - * - | ``double lgamma(double x)`` - | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. - - ✓ - - ✗ - - * - | ``long int lrint(double x)`` - | Round :math:`x` to nearest integer value. - - ✓ - - ✓ - - * - | ``long long int llrint(double x)`` - | Round :math:`x` to nearest integer value. - - ✓ - - ✓ - - * - | ``long int lround(double x)`` - | Round to nearest integer value. - - ✓ - - ✓ - - * - | ``long long int llround(double x)`` - | Round to nearest integer value. - - ✓ - - ✓ - - * - | ``double log10(double x)`` - | Returns the base 10 logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``double log1p(double x)`` - | Returns the natural logarithm of :math:`x + 1`. - - ✓ - - ✓ - - * - | ``double log2(double x)`` - | Returns the base 2 logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``double log(double x)`` - | Returns the natural logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``double logb(double x)`` - | Returns the floating point representation of the exponent of :math:`x`. - - ✓ - - ✓ - - * - | ``double nan(const char* tagp)`` - | Returns "Not a Number" value. - - ✗ - - ✓ - - * - | ``double nearbyint(double x)`` - | Round :math:`x` to the nearest integer. - - ✓ - - ✓ - - * - | ``double nextafter(double x, double y)`` - | Returns next representable double-precision floating-point value after argument. - - ✓ - - ✓ - - * - | ``double norm3d(double x, double y, double z)`` - | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`. - - ✓ - - ✓ - - * - | ``double norm4d(double x, double y, double z, double w)`` - | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`. 
- - ✓ - - ✓ - - * - | ``double normcdf(double y)`` - | Returns the standard normal cumulative distribution function. - - ✓ - - ✓ - - * - | ``double normcdfinv(double y)`` - | Returns the inverse of the standard normal cumulative distribution function. - - ✓ - - ✓ - - * - | ``double norm(int dim, const double *a)`` - | Returns the square root of the sum of squares of any number of coordinates. - - ✓ - - ✓ - - * - | ``double pow(double x, double y)`` - | Returns :math:`x^y`. - - ✓ - - ✓ - - * - | ``double powi(double base, int iexp)`` - | Returns the value of first argument to the power of second argument. - - ✓ - - ✓ - - * - | ``double remainder(double x, double y)`` - | Returns double-precision floating-point remainder. - - ✓ - - ✓ - - * - | ``double remquo(double x, double y, int* quo)`` - | Returns double-precision floating-point remainder and part of quotient. - - ✓ - - ✗ - - * - | ``double round(double x)`` - | Round to nearest integer value in floating-point. - - ✓ - - ✓ - - * - | ``double rcbrt(double x)`` - | Returns the reciprocal cube root function. - - ✓ - - ✓ - - * - | ``double rhypot(double x, double y)`` - | Returns one over the square root of the sum of squares of two arguments. - - ✓ - - ✓ - - * - | ``double rint(double x)`` - | Round input to nearest integer value in floating-point. - - ✓ - - ✓ - - * - | ``double rnorm3d(double x, double y, double z)`` - | Returns one over the square root of the sum of squares of three coordinates of the argument. - - ✓ - - ✓ - - * - | ``double rnorm4d(double x, double y, double z, double w)`` - | Returns one over the square root of the sum of squares of four coordinates of the argument. - - ✓ - - ✓ - - * - | ``double rnorm(int dim, const double *a)`` - | Returns the reciprocal of square root of the sum of squares of any number of coordinates. - - ✓ - - ✓ - - * - | ``double scalbln(double x, long int n)`` - | Scale :math:`x` by :math:`2^n`. - - ✓ - - ✓ - - * - | ``double scalbn(double x, int n)`` - | Scale :math:`x` by :math:`2^n`. - - ✓ - - ✓ - - * - | ``bool signbit(double x)`` - | Return the sign bit of :math:`x`. - - ✓ - - ✓ - - * - | ``double sin(double x)`` - | Returns the sine of :math:`x`. - - ✓ - - ✓ - - * - | ``double sinh(double x)`` - | Returns the hyperbolic sine of :math:`x`. - - ✓ - - ✓ - - * - | ``double sinpi(double x)`` - | Returns the hyperbolic sine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``void sincos(double x, double *sptr, double *cptr)`` - | Returns the sine and cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``void sincospi(double x, double *sptr, double *cptr)`` - | Returns the sine and cosine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``double sqrt(double x)`` - | Returns the square root of :math:`x`. - - ✓ - - ✓ - - * - | ``double rsqrt(double x)`` - | Returns the reciprocal of the square root of :math:`x`. - - ✗ - - ✓ - - * - | ``double tan(double x)`` - | Returns the tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``double tanh(double x)`` - | Returns the hyperbolic tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``double tgamma(double x)`` - | Returns the gamma function of :math:`x`. - - ✓ - - ✓ - - * - | ``double trunc(double x)`` - | Truncate :math:`x` to the integral part. - - ✓ - - ✓ - - * - | ``double y0(double x)`` - | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`. - - ✓ - - ✓ - - * - | ``double y1(double x)`` - | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`. 
- - ✓ - - ✓ - - * - | ``double yn(int n, double x)`` - | Returns the value of the Bessel function of the second kind of order n for :math:`x`. - - ✓ - - ✓ + * - | ``double cospi(double x)`` + | Returns the cosine of :math:`\pi \cdot x`. + - :math:`x \in [-0.3, 0.3]` + - 2 -Integer intrinsics -================== + * - | ``double sinpi(double x)`` + | Returns the hyperbolic sine of :math:`\pi \cdot x`. + - :math:`x \in [-0.625, 0.625]` + - 2 -Following is the list of supported integer intrinsics. Note that intrinsics are supported on device only. + * - | ``void sincospi(double x, double *sptr, double *cptr)`` + | Returns the sine and cosine of :math:`\pi \cdot x`. + - :math:`x \in [-0.3, 0.3]` + - | ``sinpi``: 2 + | ``cospi``: 2 -.. list-table:: Integer intrinsics mathematical functions +No C++ STD Implementation +------------------------- - * - **Function** +This table lists HIP device functions that do not have a direct equivalent in the C++ standard library. +These functions were excluded from comparison due to the complexity of implementing a precise +reference version within the standard library's constraints. - * - | ``unsigned int __brev(unsigned int x)`` - | Reverse the bit order of a 32 bit unsigned integer. +.. tab-set:: - * - | ``unsigned long long int __brevll(unsigned long long int x)`` - | Reverse the bit order of a 64 bit unsigned integer. + .. tab-item:: Single Precision Floating-point - * - | ``unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int z)`` - | Return selected bytes from two 32-bit unsigned integers. + .. list-table:: - * - | ``unsigned int __clz(int x)`` - | Return the number of consecutive high-order zero bits in 32 bit integer. + * - **Function** - * - | ``unsigned int __clzll(long long int x)`` - | Return the number of consecutive high-order zero bits in 64 bit integer. + * - | ``float j0f(float x)`` + | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`. - * - | ``unsigned int __ffs(int x)`` - | Find the position of least significant bit set to 1 in a 32 bit integer. + * - | ``float j1f(float x)`` + | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`. - * - | ``unsigned int __ffsll(long long int x)`` - | Find the position of least significant bit set to 1 in a 64 bit signed integer. + * - | ``float jnf(int n, float x)`` + | Returns the value of the Bessel function of the first kind of order n for :math:`x`. - * - | ``unsigned int __fns32(unsigned long long mask, unsigned int base, int offset)`` - | Find the position of the n-th set to 1 bit in a 32-bit integer. + * - | ``float y0f(float x)`` + | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`. - * - | ``unsigned int __fns64(unsigned long long int mask, unsigned int base, int offset)`` - | Find the position of the n-th set to 1 bit in a 64-bit integer. + * - | ``float y1f(float x)`` + | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`. - * - | ``unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)`` - | Concatenate :math:`hi` and :math:`lo`, shift left by shift & 31 bits, return the most significant 32 bits. + * - | ``float ynf(int n, float x)`` + | Returns the value of the Bessel function of the second kind of order n for :math:`x`. 
- * - | ``unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)`` - | Concatenate :math:`hi` and :math:`lo`, shift left by min(shift, 32) bits, return the most significant 32 bits. + * - | ``float erfcinvf(float x)`` + | Returns the inverse complementary function of :math:`x`. - * - | ``unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)`` - | Concatenate :math:`hi` and :math:`lo`, shift right by shift & 31 bits, return the least significant 32 bits. + * - | ``float erfinvf(float x)`` + | Returns the inverse error function of :math:`x`. - * - | ``unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)`` - | Concatenate :math:`hi` and :math:`lo`, shift right by min(shift, 32) bits, return the least significant 32 bits. + * - | ``float normcdff(float y)`` + | Returns the standard normal cumulative distribution function. - * - | ``unsigned int __hadd(int x, int y)`` - | Compute average of signed input arguments, avoiding overflow in the intermediate sum. + * - | ``float normcdfinvf(float y)`` + | Returns the inverse of the standard normal cumulative distribution function. - * - | ``unsigned int __rhadd(int x, int y)`` - | Compute rounded average of signed input arguments, avoiding overflow in the intermediate sum. + .. tab-item:: Double Precision Floating-point - * - | ``unsigned int __uhadd(int x, int y)`` - | Compute average of unsigned input arguments, avoiding overflow in the intermediate sum. + .. list-table:: - * - | ``unsigned int __urhadd (unsigned int x, unsigned int y)`` - | Compute rounded average of unsigned input arguments, avoiding overflow in the intermediate sum. + * - **Function** - * - | ``int __sad(int x, int y, int z)`` - | Returns :math:`|x - y| + z`, the sum of absolute difference. + * - | ``double j0(double x)`` + | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`. - * - | ``unsigned int __usad(unsigned int x, unsigned int y, unsigned int z)`` - | Returns :math:`|x - y| + z`, the sum of absolute difference. + * - | ``double j1(double x)`` + | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`. - * - | ``unsigned int __popc(unsigned int x)`` - | Count the number of bits that are set to 1 in a 32 bit integer. + * - | ``double jn(int n, double x)`` + | Returns the value of the Bessel function of the first kind of order n for :math:`x`. - * - | ``unsigned int __popcll(unsigned long long int x)`` - | Count the number of bits that are set to 1 in a 64 bit integer. + * - | ``double y0(double x)`` + | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`. - * - | ``int __mul24(int x, int y)`` - | Multiply two 24bit integers. + * - | ``double y1(double x)`` + | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`. - * - | ``unsigned int __umul24(unsigned int x, unsigned int y)`` - | Multiply two 24bit unsigned integers. + * - | ``double yn(int n, double x)`` + | Returns the value of the Bessel function of the second kind of order n for :math:`x`. - * - | ``int __mulhi(int x, int y)`` - | Returns the most significant 32 bits of the product of the two 32-bit integers. + * - | ``double erfcinv(double x)`` + | Returns the inverse complementary function of :math:`x`. - * - | ``unsigned int __umulhi(unsigned int x, unsigned int y)`` - | Returns the most significant 32 bits of the product of the two 32-bit unsigned integers. 
+ * - | ``double erfinv(double x)`` + | Returns the inverse error function of :math:`x`. - * - | ``long long int __mul64hi(long long int x, long long int y)`` - | Returns the most significant 64 bits of the product of the two 64-bit integers. + * - | ``double normcdf(double y)`` + | Returns the standard normal cumulative distribution function. - * - | ``unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y)`` - | Returns the most significant 64 bits of the product of the two 64 unsigned bit integers. + * - | ``double normcdfinv(double y)`` + | Returns the inverse of the standard normal cumulative distribution function. -The HIP-Clang implementation of ``__ffs()`` and ``__ffsll()`` contains code to add a constant +1 to produce the ``ffs`` result format. -For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform, -HIP-Clang provides ``__lastbit_u32_u32(unsigned int input)`` and ``__lastbit_u32_u64(unsigned long long int input)``. -The index returned by ``__lastbit_`` instructions starts at -1, while for ``ffs`` the index starts at 0. +Unsupported +----------- -Floating-point Intrinsics -========================= +This table lists functions that are not supported by HIP. + +.. tab-set:: + + .. tab-item:: Single Precision Floating-point + + .. list-table:: -Following is the list of supported floating-point intrinsics. Note that intrinsics are supported on device only. + * - **Function** + + * - | ``float cyl_bessel_i0f(float x)`` + | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`. + + * - | ``float cyl_bessel_i1f(float x)`` + | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`. + + .. tab-item:: Double Precision Floating-point + + .. list-table:: + + * - **Function** + + * - | ``double cyl_bessel_i0(double x)`` + | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`. + + * - | ``double cyl_bessel_i1(double x)`` + | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`. + +Intrinsic mathematical functions +================================ + +Intrinsic math functions are optimized for performance on HIP-supported hardware. These functions often +trade some precision for faster execution, making them ideal for applications where computational +efficiency is a priority over strict numerical accuracy. Note that intrinsics are supported on device only. + +Floating-point Intrinsics +------------------------- .. note:: - Only the nearest even rounding mode supported on AMD GPUs by defaults. The ``_rz``, ``_ru`` and - ``_rd`` suffixed intrinsic functions are existing in HIP AMD backend, if the + Only the nearest-even rounding mode is supported by default on AMD GPUs. The ``_rz``, ``_ru``, and ``_rd`` + suffixed intrinsic functions exist in the HIP AMD backend if the ``OCML_BASIC_ROUNDED_OPERATIONS`` macro is defined. .. list-table:: Single precision intrinsics mathematical functions + :widths: 50,20,30 * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** * - | ``float __cosf(float x)`` | Returns the fast approximate cosine of :math:`x`. + - :math:`x \in [-\pi, \pi]` + - 4 * - | ``float __exp10f(float x)`` | Returns the fast approximate for 10 :sup:`x`. + - :math:`x \in [-4, 4]` + - 18 * - | ``float __expf(float x)`` | Returns the fast approximate for e :sup:`x`. 
+ - :math:`x \in [-10, 10]` + - 6 * - | ``float __fadd_rn(float x, float y)`` | Add two floating-point values in round-to-nearest-even mode. + - | :math:`x \in [-1000, 1000]` + | :math:`y \in [-1000, 1000]` + - 0 * - | ``float __fdiv_rn(float x, float y)`` - | Divide two floating point values in round-to-nearest-even mode. + | Divide two floating-point values in round-to-nearest-even mode. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-100, 100]` + - 0 * - | ``float __fmaf_rn(float x, float y, float z)`` | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-10, 10]` + | :math:`z \in [-10, 10]` + - 0 * - | ``float __fmul_rn(float x, float y)`` | Multiply two floating-point values in round-to-nearest-even mode. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-100, 100]` + - 0 * - | ``float __frcp_rn(float x, float y)`` | Returns ``1 / x`` in round-to-nearest-even mode. + - :math:`x \in [-100, 100]` + - 0 * - | ``float __frsqrt_rn(float x)`` | Returns ``1 / √x`` in round-to-nearest-even mode. + - :math:`x \in [0.01, 100]` + - 1 * - | ``float __fsqrt_rn(float x)`` | Returns ``√x`` in round-to-nearest-even mode. + - :math:`x \in [0, 100]` + - 1 * - | ``float __fsub_rn(float x, float y)`` | Subtract two floating-point values in round-to-nearest-even mode. + - | :math:`x \in [-1000, 1000]` + | :math:`y \in [-1000, 1000]` + - 0 * - | ``float __log10f(float x)`` | Returns the fast approximate for base 10 logarithm of :math:`x`. + - :math:`x \in [10^{-6}, 10^6]` + - 2 * - | ``float __log2f(float x)`` | Returns the fast approximate for base 2 logarithm of :math:`x`. + - :math:`x \in [10^{-6}, 10^6]` + - 1 * - | ``float __logf(float x)`` | Returns the fast approximate for natural logarithm of :math:`x`. + - :math:`x \in [10^{-6}, 10^6]` + - 2 * - | ``float __powf(float x, float y)`` | Returns the fast approximate of x :sup:`y`. + - | :math:`x \in [-4, 4]` + | :math:`y \in [-2, 2]` + - 1 * - | ``float __saturatef(float x)`` | Clamp :math:`x` to [+0.0, 1.0]. + - :math:`x \in [-2, 3]` + - 0 * - | ``float __sincosf(float x, float* sinptr, float* cosptr)`` | Returns the fast approximate of sine and cosine of :math:`x`. + - :math:`x \in [-3, 3]` + - | ``sin``: 18 + | ``cos``: 4 * - | ``float __sinf(float x)`` | Returns the fast approximate sine of :math:`x`. + - :math:`x \in [-\pi, \pi]` + - 18 * - | ``float __tanf(float x)`` | Returns the fast approximate tangent of :math:`x`. + - :math:`x \in [-1.47\pi, 1.47\pi]` + - 1 .. list-table:: Double precision intrinsics mathematical functions + :widths: 50,20,30 * - **Function** + - **Test Range** + - **ULP Difference of Maximum Absolute Error** * - | ``double __dadd_rn(double x, double y)`` | Add two floating-point values in round-to-nearest-even mode. + - | :math:`x \in [-1000, 1000]` + | :math:`y \in [-1000, 1000]` + - 0 * - | ``double __ddiv_rn(double x, double y)`` | Divide two floating-point values in round-to-nearest-even mode. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-100, 100]` + - 0 * - | ``double __dmul_rn(double x, double y)`` | Multiply two floating-point values in round-to-nearest-even mode. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-100, 100]` + - 0 * - | ``double __drcp_rn(double x, double y)`` | Returns ``1 / x`` in round-to-nearest-even mode. + - :math:`x \in [-100, 100]` + - 0 * - | ``double __dsqrt_rn(double x)`` | Returns ``√x`` in round-to-nearest-even mode. 
+ - :math:`x \in [0, 100]` + - 0 * - | ``double __dsub_rn(double x, double y)`` | Subtract two floating-point values in round-to-nearest-even mode. + - | :math:`x \in [-1000, 1000]` + | :math:`y \in [-1000, 1000]` + - 0 * - | ``double __fma_rn(double x, double y, double z)`` | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode. + - | :math:`x \in [-100, 100]` + | :math:`y \in [-10, 10]` + | :math:`z \in [-10, 10]` + - 0 + +Integer intrinsics +------------------ + +This section covers HIP integer intrinsic functions. ULP error values are omitted +since they only apply to floating-point operations, not integer arithmetic. + +.. list-table:: Integer intrinsics mathematical functions + + * - **Function** + + * - | ``unsigned int __brev(unsigned int x)`` + | Reverse the bit order of a 32 bit unsigned integer. + + * - | ``unsigned long long int __brevll(unsigned long long int x)`` + | Reverse the bit order of a 64 bit unsigned integer. + + * - | ``unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int z)`` + | Return selected bytes from two 32-bit unsigned integers. + + * - | ``unsigned int __clz(int x)`` + | Return the number of consecutive high-order zero bits in 32 bit integer. + + * - | ``unsigned int __clzll(long long int x)`` + | Return the number of consecutive high-order zero bits in 64 bit integer. + + * - | ``unsigned int __ffs(int x)`` [1]_ + | Returns the position of the first set bit in a 32 bit integer. + | Note: if ``x`` is ``0``, will return ``0`` + + * - | ``unsigned int __ffsll(long long int x)`` [1]_ + | Returns the position of the first set bit in a 64 bit signed integer. + | Note: if ``x`` is ``0``, will return ``0`` + + * - | ``unsigned int __fns32(unsigned long long mask, unsigned int base, int offset)`` + | Find the position of the n-th set to 1 bit in a 32-bit integer. + | Note: this intrinsic is emulated via software, so performance can be potentially slower + + * - | ``unsigned int __fns64(unsigned long long int mask, unsigned int base, int offset)`` + | Find the position of the n-th set to 1 bit in a 64-bit integer. + | Note: this intrinsic is emulated via software, so performance can be potentially slower + + * - | ``unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)`` + | Concatenate :math:`hi` and :math:`lo`, shift left by shift & 31 bits, return the most significant 32 bits. + + * - | ``unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)`` + | Concatenate :math:`hi` and :math:`lo`, shift left by min(shift, 32) bits, return the most significant 32 bits. + + * - | ``unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)`` + | Concatenate :math:`hi` and :math:`lo`, shift right by shift & 31 bits, return the least significant 32 bits. + + * - | ``unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)`` + | Concatenate :math:`hi` and :math:`lo`, shift right by min(shift, 32) bits, return the least significant 32 bits. + + * - | ``unsigned int __hadd(int x, int y)`` + | Compute average of signed input arguments, avoiding overflow in the intermediate sum. + + * - | ``unsigned int __rhadd(int x, int y)`` + | Compute rounded average of signed input arguments, avoiding overflow in the intermediate sum. + + * - | ``unsigned int __uhadd(int x, int y)`` + | Compute average of unsigned input arguments, avoiding overflow in the intermediate sum. 
+ + * - | ``unsigned int __urhadd (unsigned int x, unsigned int y)`` + | Compute rounded average of unsigned input arguments, avoiding overflow in the intermediate sum. + + * - | ``int __sad(int x, int y, int z)`` + | Returns :math:`|x - y| + z`, the sum of absolute difference. + + * - | ``unsigned int __usad(unsigned int x, unsigned int y, unsigned int z)`` + | Returns :math:`|x - y| + z`, the sum of absolute difference. + + * - | ``unsigned int __popc(unsigned int x)`` + | Count the number of bits that are set to 1 in a 32 bit integer. + + * - | ``unsigned int __popcll(unsigned long long int x)`` + | Count the number of bits that are set to 1 in a 64 bit integer. + + * - | ``int __mul24(int x, int y)`` + | Multiply two 24bit integers. + + * - | ``unsigned int __umul24(unsigned int x, unsigned int y)`` + | Multiply two 24bit unsigned integers. + + * - | ``int __mulhi(int x, int y)`` + | Returns the most significant 32 bits of the product of the two 32-bit integers. + + * - | ``unsigned int __umulhi(unsigned int x, unsigned int y)`` + | Returns the most significant 32 bits of the product of the two 32-bit unsigned integers. + + * - | ``long long int __mul64hi(long long int x, long long int y)`` + | Returns the most significant 64 bits of the product of the two 64-bit integers. + + * - | ``unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y)`` + | Returns the most significant 64 bits of the product of the two 64 unsigned bit integers. + +.. [1] The HIP-Clang implementation of ``__ffs()`` and ``__ffsll()`` contains code to add a constant +1 to produce the ``ffs`` result format. + For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform, + HIP-Clang provides ``__lastbit_u32_u32(unsigned int input)`` and ``__lastbit_u32_u64(unsigned long long int input)``. + The index returned by ``__lastbit_`` instructions starts at -1, while for ``ffs`` the index starts at 0. From 6f1733c7e843b970bde45237ded3b5ee2b685b81 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Mon, 10 Mar 2025 16:08:52 +0100 Subject: [PATCH 14/32] Remove external link --- docs/reference/math_api.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/reference/math_api.rst b/docs/reference/math_api.rst index 16c97c3637..504054ff91 100644 --- a/docs/reference/math_api.rst +++ b/docs/reference/math_api.rst @@ -16,11 +16,6 @@ This section documents: - Maximum error bounds for supported HIP math functions - Currently unsupported functions -For a comprehensive analysis of mathematical function accuracy—including detailed evaluations -in single, double, and quadruple precision and a discussion of the IEEE 754 standard's recommendations -on correct rounding — see the paper -`Accuracy of Mathematical Functions `_. - Error bounds on this page are measured in units in the last place (ULPs), representing the absolute difference between a HIP math function result and its corresponding C++ standard library function (e.g., comparing HIP's sinf with C++'s sinf). 
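To make the ULP metric concrete, the following host-only sketch shows one common way to count the representable single-precision values between a device result and its host reference. The ``ulp_distance`` helper is illustrative only and is not part of the documented test harness; here a host value perturbed by ``std::nextafter`` stands in for a result copied back from the GPU.

.. code-block:: cpp

   #include <cmath>
   #include <cstdint>
   #include <cstdio>
   #include <cstring>

   // Count the number of representable floats between two values by mapping
   // the IEEE 754 bit patterns onto a monotonic integer scale, so that
   // adjacent floats differ by exactly 1.
   static uint32_t ulp_distance(float a, float b)
   {
       int32_t ia, ib;
       std::memcpy(&ia, &a, sizeof ia);
       std::memcpy(&ib, &b, sizeof ib);
       if (ia < 0) ia = INT32_MIN - ia; // reorder negative values monotonically
       if (ib < 0) ib = INT32_MIN - ib;
       int64_t d = static_cast<int64_t>(ia) - static_cast<int64_t>(ib);
       return static_cast<uint32_t>(d < 0 ? -d : d);
   }

   int main()
   {
       // deviceResult would normally be copied back from a kernel that
       // evaluated sinf on the GPU; a perturbed host value stands in here.
       float x            = 0.5f;
       float hostResult   = std::sin(x);
       float deviceResult = std::nextafter(hostResult, 1.0f); // pretend 1 ULP off
       std::printf("ULP difference: %u\n", ulp_distance(hostResult, deviceResult));
       return 0;
   }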
From 36f4fac08294219a2816839d99777a57a82c6a02 Mon Sep 17 00:00:00 2001 From: randyh62 Date: Wed, 15 Jan 2025 15:14:36 -0800 Subject: [PATCH 15/32] Update programming model --- .wordlist.txt | 6 +- .../hip_runtime_api/cooperative_groups.rst | 2 +- docs/understand/hardware_implementation.rst | 9 +- docs/understand/programming_model.rst | 400 ++++++++++++------ 4 files changed, 286 insertions(+), 131 deletions(-) diff --git a/.wordlist.txt b/.wordlist.txt index c35dfb045d..fab10155f6 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -7,7 +7,8 @@ APUs AQL AXPY asm -asynchrony +Asynchronicity +Asynchrony backtrace bfloat Bitcode @@ -71,6 +72,7 @@ hipModule hipModuleLaunchKernel hipother HIPRTC +hyperthreading icc IILE iGPU @@ -116,6 +118,7 @@ NDRange nonnegative NOP Numa +ns Nsight ocp omnitrace @@ -124,6 +127,7 @@ overindexing oversubscription overutilized parallelizable +pipelining parallelized pixelated pragmas diff --git a/docs/how-to/hip_runtime_api/cooperative_groups.rst b/docs/how-to/hip_runtime_api/cooperative_groups.rst index 3170e197ef..a3e32cd294 100644 --- a/docs/how-to/hip_runtime_api/cooperative_groups.rst +++ b/docs/how-to/hip_runtime_api/cooperative_groups.rst @@ -164,7 +164,7 @@ The ``thread_rank()`` , ``size()``, ``cg_type()``, ``is_valid()``, ``sync()``, ` Coalesced groups ------------------ -Threads (64 threads on CDNA and 32 threads on RDNA) in a warp cannot execute different instructions simultaneously, so conditional branches are executed serially within the warp. When threads encounter a conditional branch, they can diverge, resulting in some threads being disabled, if they do not meet the condition to execute that branch. The active threads referred as coalesced, and coalesced group represents an active thread group within a warp. +Threads (64 threads on CDNA and 32 threads on RDNA) in a warp cannot execute different instructions simultaneously, so conditional branches are executed serially within the warp. When threads encounter a conditional branch, they can diverge, resulting in some threads being disabled if they do not meet the condition to execute that branch. The active threads are referred to as coalesced, and coalesced group represents an active thread group within a warp. .. note:: diff --git a/docs/understand/hardware_implementation.rst b/docs/understand/hardware_implementation.rst index 7038262812..e57f7d4505 100644 --- a/docs/understand/hardware_implementation.rst +++ b/docs/understand/hardware_implementation.rst @@ -45,12 +45,13 @@ The amount of warps that can reside concurrently on a CU, known as occupancy, is determined by the warp's resource usage of registers and shared memory. +.. _gcn_cu: + .. figure:: ../data/understand/hardware_implementation/compute_unit.svg :alt: Diagram depicting the general structure of a compute unit of an AMD GPU. - An AMD Graphics Core Next (GCN) CU. The CDNA and RDNA CUs are based on - variations of the GCN CU. + AMD Graphics Core Next (GCN) CU On AMD GCN GPUs the basic structure of a CU is: @@ -102,6 +103,8 @@ The scalar unit performs instructions that are uniform within a warp. It thereby improves efficiency and reduces the pressure on the vector ALUs and the vector register file. +.. _cdna3_cu: + CDNA architecture ================= @@ -121,6 +124,8 @@ multiply-accumulate operations for Block Diagram of a CDNA3 Compute Unit. +.. 
_rdna3_cu: + RDNA architecture ================= diff --git a/docs/understand/programming_model.rst b/docs/understand/programming_model.rst index 6c7015996f..64a92df470 100644 --- a/docs/understand/programming_model.rst +++ b/docs/understand/programming_model.rst @@ -7,67 +7,91 @@ .. _programming_model: ******************************************************************************* -HIP programming model +Introduction to HIP programming model ******************************************************************************* The HIP programming model makes it easy to map data-parallel C/C++ algorithms to massively parallel, wide single instruction, multiple data (SIMD) architectures, -such as GPUs. +such as GPUs. HIP supports many imperative languages, such as Python via PyHIP, +but this document focuses on the original C/C++ API of HIP. -While the model may be expressed in most imperative languages, (for example -Python via PyHIP) this document will focus on the original C/C++ API of HIP. - -A basic understanding of the underlying device architecture helps you +While GPUs may be capable of running applications written for CPUs if properly ported +and compiled, it would not be an efficient use of GPU resources. GPUs are different +from CPUs in fundamental ways, and should be used accordingly to achieve optimum +performance. A basic understanding of the underlying device architecture helps you make efficient use of HIP and general purpose graphics processing unit (GPGPU) -programming in general. +programming in general. The following topics introduce you to the key concepts of +GPU-based programming, and the HIP programming model. + +Getting into Hardware: CPU vs GPU +================================= + +CPUs and GPUs have been designed for different purposes. CPUs have been designed +to quickly execute a single thread, decreasing the time it takes for a single +operation, increasing the amount of sequential instructions that can be executed. +This includes fetching data, and reducing pipeline stalls where the ALU has to +wait for previous instructions to finish. + +On CPUs the goal is to quickly process operations. CPUs provide low latency processing for +serial instructions. On the other hand, GPUs have been designed to execute many similar commands, or threads, +in parallel, achieving higher throughput. Latency is the delay from when an operation +is started to when it returns, such as 2 ns, while throughput is the number of operations completed +in a period of time, such as ten thousand threads completed. -RDNA & CDNA architecture summary -================================ +For the GPU, the objective is to process as many operations in parallel, rather +than to finish a single instruction quickly. GPUs in general are made up of basic +building blocks called compute units (CUs), that execute the threads of a kernel. +As described in :ref:`hardware_implementation`, these CUs provide the necessary +resources for the threads: the Arithmetic Logical Units (ALUs), register files, +caches and shared memory for efficient communication between the threads. -GPUs in general are made up of basic building blocks called compute units (CUs), -that execute the threads of a kernel. These CUs provide the necessary resources -for the threads: the Arithmetic Logical Units (ALUs), register files, caches and -shared memory for efficient communication between the threads. 
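For a quick look at how these resources show up on a concrete device, the following minimal host-only sketch queries a few per-CU limits through ``hipGetDeviceProperties``; the values printed depend entirely on the GPU installed in your system.

.. code-block:: cpp

   #include <hip/hip_runtime.h>
   #include <cstdio>

   int main()
   {
       hipDeviceProp_t props;
       // Query the properties of device 0.
       if (hipGetDeviceProperties(&props, 0) != hipSuccess) {
           std::printf("No HIP device found\n");
           return 1;
       }
       std::printf("Device name            : %s\n", props.name);
       std::printf("Compute units          : %d\n", props.multiProcessorCount);
       std::printf("Warp (wavefront) size  : %d\n", props.warpSize);
       std::printf("Registers per block    : %d\n", props.regsPerBlock);
       std::printf("Shared memory per block: %zu bytes\n", props.sharedMemPerBlock);
       return 0;
   }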
+The following defines a few hardware differences between CPUs and GPUs: -This design allows for efficient execution of kernels while also being able to -scale from small GPUs embedded in APUs with few CUs up to GPUs designed for data -centers with hundreds of CUs. Figure :ref:`rdna3_cu` and :ref:`cdna3_cu` show -examples of such compute units. +* CPU: -For architecture details, check :ref:`hardware_implementation`. + - Optimized for sequential processing with a few powerful cores (4-64 typically) + - High clock speeds (3-5 GHz) + - One register file per thread. On modern CPUs you have at most 2 register files per core, called hyperthreading. + - One ALU executing the thread. -.. _rdna3_cu: + - Designed to quickly execute instructions of the same thread. + - Complex branch prediction. -.. figure:: ../data/understand/programming_model/rdna3_cu.png - :alt: Block diagram showing the structure of an RDNA3 Compute Unit. It - consists of four SIMD units, each including a vector and scalar register - file, with the corresponding scalar and vector ALUs. All four SIMDs - share a scalar and instruction cache, as well as the shared memory. Two - of the SIMD units each share an L0 cache. + - Large L1/L2 cache per core, shared by fewer threads (maximum of 2 when hyperthreading is available). + - A disadvantage is switching execution from one thread to another (or context switching) takes a considerable amount of time: the ALU pipeline needs to be emptied, the register file has to be written to memory to free the register for another thread. + +* GPU: - Block Diagram of an RDNA3 Compute Unit. + - Designed for parallel processing with many simpler cores (hundreds/thousands) + - Lower clock speeds (1-2 GHz) + - Streamlined control logic + - Small caches, more registers + - Register files are shared among threads. The number of threads that can be run in parallel depends on the registers needed per thread. + - Multiple ALUs execute a collection of threads having the same operations, also known as a wavefront or warp. This is called single-instruction, multiple threads (SIMT) operation as described in :ref:`programming_model_simt`. -.. _cdna3_cu: + - The collection of ALUs is called SIMD. SIMDs are an extension to the hardware architecture, that allows a `single instruction` to concurrently operate on `multiple data` inputs. + - For branching threads where conditional instructions lead to thread divergence, ALUs still processes the full wavefront, but the result for divergent threads is masked out. This leads to wasted ALU cycles, and should be a consideration in your programming. Keep instructions consistent, and leave conditionals out of threads. -.. figure:: ../data/understand/programming_model/cdna3_cu.png - :alt: Block diagram showing the structure of a CDNA3 compute unit. It includes - Shader Cores, the Matrix Core Unit, a Local Data Share used for sharing - memory between threads in a block, an L1 Cache and a Scheduler. The - Shader Cores represent the vector ALUs and the Matrix Core Unit the - matrix ALUs. The Local Data Share is used as the shared memory. + - The advantage for GPUs is that context switching is easy. All threads that run on a core/compute unit have their registers on the compute unit, so they don't need to be stored to global memory, and each cycle one instruction from any wavefront that resides on the compute unit can be issued. - Block Diagram of a CDNA3 Compute Unit. 
+When programming for a heterogeneous system, which incorporates CPUs and GPUs, you must +write your program to take advantage of the strengths of the available hardware. +Use the CPU for tasks that require complex logic with conditional branching, to reduce the +time to reach a decision. Use the GPU for parallel operations of the same instruction +across large datasets, with little branching, where the volume of operations is the key. + +.. _heterogeneous_programming: Heterogeneous Programming ========================= -The HIP programming model assumes two execution contexts. One is referred to as -*host* while compute kernels execute on a *device*. These contexts have -different capabilities, therefor slightly different rules apply. The *host* -execution is defined by the C++ abstract machine, while *device* execution -follows the :ref:`SIMT model` of HIP. These execution contexts in -code are signified by the ``__host__`` and ``__device__`` decorators. There are -a few key differences between the two: +The HIP programming model has two execution contexts. The main application starts on the CPU, or +the *host* processor, and compute kernels are launched on the *device* such as `Instinct +accelerators `_ or AMD GPUs. +The host execution is defined by the C++ abstract machine, while device execution +follows the :ref:`SIMT model` of HIP. These two execution contexts +are signified by the ``__host__`` and ``__global__`` (or ``__device__``) decorators +in HIP program code. There are a few key differences between the two contexts: * The C++ abstract machine assumes a unified memory address space, meaning that one can always access any given address in memory (assuming the absence of @@ -75,65 +99,123 @@ a few key differences between the two: from one means nothing in another. Moreover, not all address spaces are accessible from all contexts. - Looking at :ref:`rdna3_cu` and :ref:`cdna3_cu`, you can see that - every CU has an instance of storage backing the namespace ``__shared__``. - Even if the host were to have access to these regions of - memory, the performance benefits of the segmented memory subsystem are + Looking at the :ref:`gcn_cu` figure, you can see that every CU has an instance of storage + backing the namespace ``__shared__``. Even if the host were to have access to these + regions of memory, the performance benefits of the segmented memory subsystem are supported by the inability of asynchronous access from the host. -* Not all C++ language features map cleanly to typical device architectures, - some are very expensive (meaning slow) to implement on GPU devices, therefor - they are forbidden in device contexts to avoid users tapping into features - that unexpectedly decimate their program's performance. Offload devices targeted - by HIP aren't general purpose devices, at least not in the sense that a CPU is. - HIP focuses on data parallel computations and as such caters to throughput - optimized architectures, such as GPUs or accelerators derived from GPU - architectures. +* Not all C++ language features map cleanly to typical GPU device architectures. + Some C++ features have poor latency when implemented on GPU devices, therefore + they are forbidden in device contexts to avoid using features that unexpectedly + decimate the program's performance. Offload devices targeted by HIP aren't general + purpose devices, at least not in the sense that a CPU is. 
HIP focuses on data + parallel computations and as such caters to throughput optimized architectures, + such as GPUs or accelerators derived from GPU architectures. -* Asynchrony is at the forefront of the HIP API. Computations launched on the device +* Asynchronicity is at the forefront of the HIP API. Computations launched on the device execute asynchronously with respect to the host, and it is the user's responsibility to synchronize their data dispatch/fetch with computations on the device. .. note:: - HIP does perform implicit synchronization on occasions, more advanced than other - APIs such as OpenCL or SYCL, in which the responsibility of synchronization mostly - depends on the user. + HIP performs implicit synchronization on occasions, unlike some + APIs where the responsibility for synchronization is left to the user. + +Host programming +---------------- + +In heterogeneous programming, the CPU is available for processing operations but the host application has the additional task of managing data and computation exchanges between the CPU (host) and GPU (device). The host acts as the application manager, coordinating the overall workflow and directing operations to the appropriate context, handles data preparation and data transfers, and manages GPU tasks and synchronization. Here is a typical sequence of operations: + +1. Initialize the HIP runtime and select the GPU: As described in :ref:`initialization`, refers to identifying and selecting a target GPU, setting up a context to let the CPU interact with the GPU. +2. Data preparation: As discussed in :ref:`memory_management`, this includes allocating the required memory on the host and device, preparing input data and transferring it from the host to the device. The data is both transferred to the device, and passed as an input parameter when launching the kernel. +3. Configure and launch the kernel on the GPU: As described in :ref:`device_program`, define and load the kernel or kernels to be run, launch kernels using the triple chevron syntax or appropriate API call (for example ``hipLaunchKernelGGL``), and pass parameters as needed. On the GPU, kernels are run on streams, or a queue of operations. Within the same stream operations run in the order they were issued, but different streams are independent and can execute concurrently. In the HIP runtime, kernels run on the default stream when one is not specified, but specifying a stream for the kernel lets you increase concurrency in task scheduling and resource utilization, and launch and manage multiple kernels from the host program. +4. Synchronization: As described in :ref:`asynchronous_how-to`, kernel execution occurs in the context of device streams, specifically the default (`0`) stream. You can use streams and events to manage task dependencies, overlap computation with data transfers, and manage asynchronous processes to ensure proper sequencing of operations. Wait for events or streams to finish execution and transfer results from the GPU back to the host. +5. Error handling: As described in :ref:`error_handling`, you should catch and handle potential errors from API calls, kernel launches, or memory operations. For example, use ``hipGetErrorString`` to retrieve error messages. +6. Cleanup and resource management: Validate results, clean up GPU contexts and resources, and free allocated memory on the host and devices. 
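A minimal host program tying these steps together might look like the following sketch. The ``VectorAdd`` kernel, the ``HIP_CHECK`` helper, and the array size are placeholders chosen only to illustrate the sequence of runtime calls, not a recommended production structure.

.. code-block:: cpp

   #include <hip/hip_runtime.h>
   #include <cstdio>
   #include <cstdlib>
   #include <vector>

   // Placeholder kernel: one thread per element.
   __global__ void VectorAdd(float* a, const float* b, size_t n)
   {
       size_t i = threadIdx.x + blockIdx.x * blockDim.x;
       if (i < n) a[i] += b[i];
   }

   // Small helper that turns a failing HIP call into a readable message (step 5).
   #define HIP_CHECK(expr)                                                  \
       do {                                                                 \
           hipError_t err = (expr);                                         \
           if (err != hipSuccess) {                                         \
               std::printf("HIP error: %s\n", hipGetErrorString(err));      \
               std::exit(1);                                                \
           }                                                                \
       } while (0)

   int main()
   {
       constexpr size_t n = 1 << 20;
       std::vector<float> a(n, 1.0f), b(n, 2.0f);

       // Step 1: select a device (device 0 here).
       HIP_CHECK(hipSetDevice(0));

       // Step 2: allocate device memory and copy the inputs over.
       float *d_a = nullptr, *d_b = nullptr;
       HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_a), n * sizeof(float)));
       HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_b), n * sizeof(float)));
       HIP_CHECK(hipMemcpy(d_a, a.data(), n * sizeof(float), hipMemcpyHostToDevice));
       HIP_CHECK(hipMemcpy(d_b, b.data(), n * sizeof(float), hipMemcpyHostToDevice));

       // Step 3: launch the kernel on the default stream.
       constexpr int threadsPerBlock = 256;
       int blocks = static_cast<int>((n + threadsPerBlock - 1) / threadsPerBlock);
       VectorAdd<<<blocks, threadsPerBlock>>>(d_a, d_b, n);
       HIP_CHECK(hipGetLastError());

       // Step 4: wait for the device and copy the result back.
       HIP_CHECK(hipDeviceSynchronize());
       HIP_CHECK(hipMemcpy(a.data(), d_a, n * sizeof(float), hipMemcpyDeviceToHost));

       // Step 6: release device resources.
       HIP_CHECK(hipFree(d_a));
       HIP_CHECK(hipFree(d_b));
       std::printf("a[0] = %f\n", a[0]); // expected 3.0
       return 0;
   }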
+ +This structure allows for efficient use of GPU resources and facilitates the acceleration of compute-intensive tasks while keeping the host CPU available for other tasks. + +.. _device_program: + +Device programming +------------------ + +The device or kernel program acts as workers on the GPU application, distributing operations to be handled quickly and efficiently. Launching a kernel in the host application starts the kernel program running on the GPU, defining the parallel operations to repeat the same instructions across many datasets. Understanding how the kernel works and the processes involved is essential to writing efficient GPU applications. Threads, blocks, and grids provide a hierarchical approach to parallel operations. Understanding the thread hierarchy is critical to distributing work across the available CUs, managing parallel operations, and optimizing memory access. The general flow of the kernel program looks like this: + +1. Thread Grouping: As described in :ref:`inherent_thread_model`, threads are organized into a hierarchy consisting of threads which are individual instances of parallel operations, blocks that group the threads together, and grids that group blocks into the kernel. Each thread runs an instance of the kernel in parallel with other threads in the block. +2. Indexing: The kernel computes the unique index for each thread to access the relevant data to be processed by the thread. +3. Data Fetch: Threads fetch input data from memory previously transferred from the host to the device. As described in :ref:`memory_hierarchy`, the hierarchy of threads is influenced by the memory subsystem of GPUs. The memory hierarchy includes local memory per-thread with very fast access, shared memory for the block of threads which also supports quick access, and larger amounts of global memory visible to the whole kernel,but accesses are expensive due to high latency. Understanding the memory model is a key concept for kernel programming. +4. Computation: Threads perform the required computations on the input data, and generate any needed output. Each thread of the kernel runs the same instruction simultaneously on the different datasets. This sometimes require multiple iterations when the number of operations exceeds the resources of the CU. +5. Synchronization: When needed, threads synchronize within their block to ensure correct results when working with shared memory. + +Kernels can be simple single instruction programs deployed across multiple threads in wavefronts, as described below and as demonstrated in the `Hello World tutorial `_ or :doc:`../tutorial/saxpy`. However, heterogeneous GPU applications can also become quite complex, managing hundreds, thousands, or hundreds of thousands of operations with repeated data transfers between host and device to support massive parallelization, using multiple streams to manage concurrent asynchronous operations, using rich libraries of functions optimized for GPU hardware as described in the `ROCm documentation `_. .. _programming_model_simt: Single instruction multiple threads (SIMT) ========================================== -The SIMT programming model behind the HIP device-side execution is a middle-ground -between SMT (Simultaneous Multi-Threading) programming known from multicore CPUs, -and SIMD (Single Instruction, Multiple Data) programming mostly known from exploiting -relevant instruction sets on CPUs (for example SSE/AVX/Neon). 
+The HIP kernel code, which is written as a series of scalar instructions for multiple +threads with different thread indices, gets mapped to the SIMD units of the GPUs. +Every single instruction, which is executed for every participating thread of a +kernel, gets mapped to the SIMD. -A HIP device compiler maps SIMT code written in HIP C++ to an inherently SIMD -architecture (like GPUs). This is done by scalarizing the entire kernel and issuing the scalar -instructions of multiple kernel instances (called threads) to each of the SIMD engine lanes, rather -than exploiting data parallelism within a single instance of a kernel and spreading -identical instructions over the available SIMD engines. +This is done by grouping threads into warps, which contain as many threads as there +are physical lanes in a SIMD, and issuing that instruction to the SIMD for every +warp of a kernel. Ideally the SIMD is always fully utilized, however if the number of threads +can't be evenly divided by the warpSize, then the unused lanes are masked out +from the corresponding SIMD execution. -Consider the following kernel: +A kernel follows the same C++ rules as the functions on the host, but it has a special ``__global__`` label to mark it for execution on the device, as shown in the following example: .. code-block:: cpp - __global__ void k(float4* a, const float4* b) + __global__ void AddKernel(float* a, const float* b) { - int tid = threadIdx.x; - int bid = blockIdx.x; - int dim = blockDim.x; + int global_idx = threadIdx.x + blockIdx.x * blockDim.x; + + a[global_idx] += b[global_idx]; + } - a[tid] += (tid + bid - dim) * b[tid]; +One of the first things you might notice is the usage of the special ``threadIdx``, +``blockIdx`` and ``blockDim`` variables. Unlike normal C++ host functions, a kernel +is not launched once, but as often as specified by the user. Each of these instances +is a separate thread, with its own values for ``threadIdx``, ``blockIdx`` and ``blockDim``. + +The kernel program is launched from the host application using a language extension +called the triple chevron syntax, which looks like the following: + +.. code-block:: cpp + + AddKernel<<>>(a, b); + +Inside the angle brackets you provide the following: + +* The number of blocks to launch, which defines the grid size (relating to blockDim). +* The number of threads in a block, which defines the block size (relating to blockIdx). +* The amount of shared memory to allocate by the host, not specified above. +* The device stream to enqueue the operation on, not specified above so the default stream is used. + +.. note:: + The kernel can also be launched through other methods, such as the ``hipLaunchKernel()`` function. + +Here the total number of threads launched for the ``AddKernel`` program is defined by +``number_of_blocks * threads_per_block``. You define these values when launching the +kernel program to address the problem to be solved with the available resources within +the system. In other words, the thread configuration is customized to the needs of the +operations and the available hardware. + +For comparison, the ``AddKernel`` program could be written in plain C++ as a ``FOR`` loop: + +.. code-block:: cpp + + for(int i = 0; i < (number_of_blocks * threads_per_block); ++i){ + a[i] += b[i]; } -The incoming four-vector of floating-point values ``b`` is multiplied by a -scalar and then added element-wise to the four-vector floating-point values of -``a``. 
On modern SIMD-capable architectures, the four-vector ops are expected to -compile to a single SIMD instruction. However, GPU execution of this kernel will -typically break down the vector elements into 4 separate threads for parallel execution, -as seen in the following figure: +In HIP, lanes of the SIMD architecture are fed by mapping threads of a SIMT +execution, one thread down each lane of an SIMD engine. Execution parallelism +usually isn't exploited from the width of the built-in vector types, but across +multiple threads via the thread ID constants ``threadIdx.x``, ``blockIdx.x``, etc. .. _simt: @@ -143,28 +225,26 @@ as seen in the following figure: inside and ellipsis between the arrows. The instructions represented in the arrows are, from top to bottom: ADD, DIV, FMA, FMA, FMA and FMA. - Instruction flow of the sample SIMT program. - -In HIP, lanes of the SIMD architecture are fed by mapping threads of a SIMT -execution, one thread down each lane of an SIMD engine. Execution parallelism -usually isn't exploited from the width of the built-in vector types, but across multiple threads via the thread ID constants ``threadIdx.x``, ``blockIdx.x``, etc. + Instruction flow of a sample SIMT program. .. _inherent_thread_model: -Inherent thread model -===================== - -The SIMT nature of HIP is captured by the ability to execute user-provided -device programs, expressed as single-source C/C++ functions or sources compiled -online/offline to binaries, in bulk. +Hierarchical thread model +--------------------- -All threads of a kernel are uniquely identified by a set of integral values, called thread IDs. -The set of integers identifying a thread relate to the hierarchy in which the threads execute. +As previously discussed, all threads of a kernel are uniquely identified by a set +of integral values called thread IDs. The hierarchy consists of three levels: thread, +blocks, and grids. -The thread hierarchy inherent to how AMD GPUs operate is depicted in the -following figure. +* Threads are single instances of kernel operations, running concurrently across warps +* Blocks group threads together and enable cooperation and shared memory +* Grids define the number of thread blocks for a single kernel launch +* Blocks, and grids can be defined in 3 dimensions (``x``, ``y``, ``z``) +* By default, the Y and Z dimensions are set to 1 -.. _inherent_thread_hierarchy: +The combined values represent the thread index, and relate to the sequence that the +threads execute. The thread hierarchy is integral to how AMD GPUs operate, and is +depicted in the following figure. .. figure:: ../data/understand/programming_model/thread_hierarchy.svg :alt: Diagram depicting nested rectangles of varying color. The outermost one @@ -175,10 +255,13 @@ following figure. Hierarchy of thread groups. +.. _wavefront: + Warp (or Wavefront) - The innermost grouping of threads is called a warp, or a wavefront in ISA terms. A warp - is the most tightly coupled groups of threads, both physically and logically. Threads - inside a warp are also called lanes, and the integral value identifying them is the lane ID. + The innermost grouping of threads is called a warp. A warp is the most tightly + coupled groups of threads, both physically and logically. Threads inside a warp + are executed in lockstep, with each thread executing the same instruction. Threads + in a warp are also called lanes, and the value identifying them is the lane ID. .. tip:: @@ -187,41 +270,53 @@ Warp (or Wavefront) calculated values to be. 
The size of a warp is architecture dependent and always fixed. For AMD GPUs - the wavefront is typically 64 threads, though sometimes 32 threads. Warps are + the warp is typically 64 threads, though sometimes 32 threads. Warps are signified by the set of communication primitives at their disposal, as discussed in :ref:`warp-cross-lane`. .. _inherent_thread_hierarchy_block: Block - The middle grouping is called a block or thread block. The defining feature - of a block is that all threads in a block will share an instance of memory - which they may use to share data or synchronize with one another. - - The size of a block is user-configurable but is limited by the queryable - capabilities of the executing hardware. The unique ID of the thread within a - block is 3-dimensional as provided by the API. When linearizing thread IDs - within a block, assume the "fast index" being dimension ``x``, followed by - the ``y`` and ``z`` dimensions. + The next level of the thread hierarchy is called a thread block, or block. The + defining feature of a block is that all threads in the block have shared memory + that they can use to share data or synchronize with one another, as described in + :ref:`memory_hierarchy`. + + The size of a block, or the block dimension, is the user-configurable number of + threads per block, but is limited by the queryable capabilities of the executing + hardware. The unique ID of the thread within a block can be 1, 2, or 3-dimensional + as provided by the HIP API. You can configure the thread block to best represent + the data associated with the kernel instruction set. + + .. note:: + When linearizing thread IDs within a block, assume the *fast index* is the ``x`` + dimension, followed by the ``y`` and ``z`` dimensions. .. _inherent_thread_hierarchy_grid: Grid - The outermost grouping is called a grid. A grid manifests as a single - dispatch of kernels for execution. The unique ID of each block within a grid - is 3-dimensional, as provided by the API and is queryable by every thread - within the block. + The top-most level of the thread hierarchy is a grid. A grid is the number of blocks + needed for a single launch of the kernel. The unique ID of each block within + a grid can be 1, 2, or 3-dimensional, as provided by the API and is queryable + by every thread within the block. + +The three-dimensional thread hierarchy available to a kernel program lends itself to solutions +that align closely to the computational problem. The following are some examples: + +* 1 dimensional: array processing, linear data structures, or sequential data transformation +* 2 dimensional: Image processing, matrix operations, 2 dimensional simulations +* 3 dimensions: Volume rendering, 3D scientific simulations, spatial algorithms Cooperative groups thread model ------------------------------- -The Cooperative groups API introduces new APIs to launch, group, subdivide, +The Cooperative groups API introduces new functions to launch, group, subdivide, synchronize and identify threads, as well as some predefined group-collective -algorithms, but most importantly a matching threading model to think in terms of. -It relaxes some restrictions of the :ref:`inherent_thread_model` imposed by the -strict 1:1 mapping of architectural details to the programming model. Cooperative -groups let you define your own set of thread groups which may fit your user-cases -better than the defaults defined by the hardware. +algorithms. 
Most importantly it offers a matching thread model to think of the +cooperative groups in terms of. It relaxes some restrictions of the :ref:`inherent_thread_model` +imposed by the strict 1:1 mapping of architectural details to the programming model. +Cooperative groups let you define your own set of thread groups which may better +fit your use-case than the defaults defined by the hardware. .. note:: The implicit groups defined by kernel launch parameters are still available @@ -229,14 +324,15 @@ better than the defaults defined by the hardware. For further information, see :doc:`Cooperative groups `. +.. _memory_hierarchy: + Memory model ============ -The hierarchy of threads introduced by the :ref:`inherent_thread_model` is induced -by the memory subsystem of GPUs. The following figure summarizes the memory -namespaces and how they relate to the various levels of the threading model. +The thread structure of the :ref:`inherent_thread_model` is supported by the memory +subsystem of GPUs. The following figure summarizes the memory namespaces and how +they relate to the various levels of the threading model. -.. _memory_hierarchy: .. figure:: ../data/understand/programming_model/memory_hierarchy.svg :alt: Diagram depicting nested rectangles of varying color. The outermost one @@ -250,10 +346,11 @@ namespaces and how they relate to the various levels of the threading model. Local or per-thread memory Read-write storage only visible to the threads defining the given variables, - also called per-thread memory. The size of a block for a given kernel, and thereby - the number of concurrent warps, are limited by local memory usage. - This relates to an important aspect: occupancy. This is the default memory - namespace. + also called per-thread memory. This is the default memory namespace. + The size of the blocks for a given kernel, and thereby the number of concurrent + warps, are limited by local memory usage. This relates to the *occupancy* of the + CU as described in :doc:`Compute Units <./hardware_implementation>`, + an important concept in resource usage and performance optimization. Shared memory Read-write storage visible to all the threads in a given block. @@ -274,10 +371,60 @@ Global Surface A read-write version of texture memory. +Using different memory types +---------------------------- + +* Use global memory when: + + - You are transferring data from the host to the device + - You have large data sets, and latency isn't an issue + - You are sharing data between thread blocks + +* Use shared memory when: + + - The data is reused within a thread block + - Cross-thread communication is needed + - To reduce global memory bandwidth + +* Use local memory when: + + - The data is specific to a thread + - To store automatic variables for the thread + - To provide register pressure relief for the thread + +* Use constant memory when: + + - The data is read-only + - The same value is used across threads + - The data size is small + +Memory access patterns and best practices +----------------------------------------- + +While you should refer to the :ref:`memory_management`, the following are a few memory +access patterns and best practices: + +* Global memory: Coalescing reduces memory transactions. +* Shared memory: Avoiding bank conflicts is crucial. +* Texture memory: Spatial locality improves caching. +* Unified memory: Structured access minimizes page migration overhead. + +When a kernel accesses global memory, the memory transactions typically occur in chunks of 32, 64, or 128 bytes. 
If threads access memory in a coalesced manner, meaning consecutive threads read or write consecutive memory locations, the memory controller can merge these accesses into a single transaction. Coalesced access primarily applies to global memory, which is the largest but slowest type of memory on a GPU and coalesced access significantly improves performance by reducing memory latency and increasing bandwidth efficiency. + +To achieve coalesced memory access in HIP, ensure that memory addresses accessed by consecutive threads are aligned. Structure data for coalesced access by storing it in a contiguous manner so that thread[i] can access array[i], and not some random location. Avoid strided access patterns, for example array[i * stride] can lead to memory bank conflicts and inefficient access. If all the threads in a warp can access consecutive memory locations, memory access is fully coalesced. + +Shared memory is a small, fast memory region inside the CU. Unlike global memory, shared memory accesses do not require coalescing, but they can suffer from bank conflicts, which are another form of inefficient memory access. Shared memory is divided into multiple memory banks (usually 32 banks on modern GPUs). If multiple threads within a warp try to access different addresses that map to the same memory bank, accesses get serialized, leading to poor performance. To optimize shared memory usage ensure that consecutive threads access different memory banks. Use padding if necessary to avoid conflicts. + +Texture memory is read-only memory optimized for spatial locality and caching rather than coalescing. Texture memory is cached, unlike standard global memory, and it provides optimized access patterns for 2D and spatially local data. Accessing neighboring values results in cache hits, improving performance. Therefore, instead of worrying about coalescing, optimal memory access patterns involve ensuring that threads access spatially adjacent texture elements, and the memory layout aligns well with the 2D caching mechanism. + +Unified memory allows the CPU and GPU to share memory seamlessly, but performance depends on access patterns. Unified memory enables automatic page migration between CPU and GPU memory. However, if different threads access different pages, it can lead to expensive page migrations and slow throughput performance. Accessing unified memory in a structured, warp-friendly manner reduces unnecessary page transfers. Ensure threads access memory in a structured, consecutive manner, minimizing page faults. Prefetch data to the GPU before computation by using ``hipMemPrefetchAsync()``. In addition, using small batch transfers as described below, can reduce unexpected page migrations when using unified memory. + +Memory transfers between the host and the device can become a major bottleneck if not optimized. One method is to use small batch memory transfers where data is transferred in smaller chunks instead of a dealing with large datasets to avoid long blocking operations. Small batch transfers offer better PCIe bandwidth utilization over large data transfers. Small batch transfers offer performance improvement by offering reduced latency with small batches that run asynchronously using ``hipMemcpyAsync()`` as described in :ref:`asynchronous_how-to`, pipelining data transfers and kernel execution using separate streams. Finally, using pinned memory with small batch transfers enables faster DMA transfers without CPU involvement, greatly improving memory transfer performance. 
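A minimal sketch of this small-batch transfer pattern is shown below. The chunk count, buffer sizes, and the ``ProcessChunk`` kernel are illustrative placeholders, and error checking is omitted for brevity:

.. code-block:: cpp

   #include <hip/hip_runtime.h>

   // Hypothetical host-side helper: copy data to the device in small batches,
   // each on its own stream, so transfers can overlap with kernel execution.
   void transferInBatches()
   {
       constexpr int chunkCount = 8;              // number of small batches
       constexpr size_t chunkElements = 1 << 20;  // elements per batch
       constexpr size_t chunkBytes = chunkElements * sizeof(float);

       float* hostData;   // pinned host memory enables asynchronous DMA transfers
       hipHostMalloc(reinterpret_cast<void**>(&hostData), chunkCount * chunkBytes, hipHostMallocDefault);
       float* deviceData;
       hipMalloc(reinterpret_cast<void**>(&deviceData), chunkCount * chunkBytes);

       hipStream_t streams[chunkCount];
       for (int i = 0; i < chunkCount; ++i) {
           hipStreamCreate(&streams[i]);
           // Enqueue the copy for this chunk; it runs asynchronously with respect
           // to the host and to work queued on the other streams.
           hipMemcpyAsync(deviceData + i * chunkElements,
                          hostData + i * chunkElements,
                          chunkBytes, hipMemcpyHostToDevice, streams[i]);
           // ProcessChunk<<<blocks, threadsPerBlock, 0, streams[i]>>>(deviceData + i * chunkElements);
       }

       for (int i = 0; i < chunkCount; ++i) {
           hipStreamSynchronize(streams[i]);
           hipStreamDestroy(streams[i]);
       }

       hipHostFree(hostData);
       hipFree(deviceData);
   }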
+ Execution model =============== -HIP programs consist of two distinct scopes: +As previously discussed in :ref:`heterogeneous_programming`, HIP programs consist of two distinct scopes: * The host-side API running on the host processor. There are two APIs available: @@ -362,4 +509,3 @@ intended use-cases. compiler itself and not intended towards end-user code. Should you be writing a tool having to launch device code using HIP, consider using these over the alternatives. - From e69e764acabf27c91a2a363623de2f7c6db28834 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Tue, 11 Mar 2025 12:58:56 +0100 Subject: [PATCH 16/32] Update docs/understand/programming_model.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> --- .wordlist.txt | 3 +- .../cpu-gpu-comparison.drawio | 181 +++++++++++ .../programming_model/cpu-gpu-comparison.svg | 1 + .../programming_model/host-device-flow.drawio | 61 ++++ .../programming_model/host-device-flow.svg | 1 + .../programming_model/memory-access.drawio | 237 ++++++++++++++ .../programming_model/memory-access.svg | 1 + .../programming_model/multi-gpu.drawio | 64 ++++ .../programming_model/multi-gpu.svg | 1 + .../programming_model/simt-execution.drawio | 124 ++++++++ .../programming_model/simt-execution.svg | 1 + .../understand/programming_model/simt.drawio | 148 --------- .../understand/programming_model/simt.svg | 1 - .../programming_model/stream-workflow.drawio | 97 ++++++ .../programming_model/stream-workflow.svg | 1 + docs/index.md | 1 - docs/programming_guide.rst | 83 ----- docs/sphinx/_toc.yml.in | 2 - docs/understand/programming_model.rst | 298 ++++++++++-------- 19 files changed, 933 insertions(+), 373 deletions(-) create mode 100644 docs/data/understand/programming_model/cpu-gpu-comparison.drawio create mode 100644 docs/data/understand/programming_model/cpu-gpu-comparison.svg create mode 100644 docs/data/understand/programming_model/host-device-flow.drawio create mode 100644 docs/data/understand/programming_model/host-device-flow.svg create mode 100644 docs/data/understand/programming_model/memory-access.drawio create mode 100644 docs/data/understand/programming_model/memory-access.svg create mode 100644 docs/data/understand/programming_model/multi-gpu.drawio create mode 100644 docs/data/understand/programming_model/multi-gpu.svg create mode 100644 docs/data/understand/programming_model/simt-execution.drawio create mode 100644 docs/data/understand/programming_model/simt-execution.svg delete mode 100644 docs/data/understand/programming_model/simt.drawio delete mode 100644 docs/data/understand/programming_model/simt.svg create mode 100644 docs/data/understand/programming_model/stream-workflow.drawio create mode 100644 docs/data/understand/programming_model/stream-workflow.svg delete mode 100644 docs/programming_guide.rst diff --git a/.wordlist.txt b/.wordlist.txt index fab10155f6..7bc0f65fa0 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -9,6 +9,7 @@ AXPY asm Asynchronicity Asynchrony +asynchrony backtrace bfloat Bitcode @@ -93,7 +94,6 @@ iteratively Lapack latencies libc -libhipcxx libstdc lifecycle linearizing @@ -176,6 +176,7 @@ ULP ULPs unintuitive UMM +uncoalesced unmap unmapped unmapping diff --git a/docs/data/understand/programming_model/cpu-gpu-comparison.drawio b/docs/data/understand/programming_model/cpu-gpu-comparison.drawio new file mode 100644 index 0000000000..a7e851b3d5 --- /dev/null +++ b/docs/data/understand/programming_model/cpu-gpu-comparison.drawio @@ -0,0 +1,181 @@ + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/understand/programming_model/cpu-gpu-comparison.svg b/docs/data/understand/programming_model/cpu-gpu-comparison.svg new file mode 100644 index 0000000000..552290299f --- /dev/null +++ b/docs/data/understand/programming_model/cpu-gpu-comparison.svg @@ -0,0 +1 @@ +
[SVG text: "CPU versus GPU Architecture" — CPU: CPU Cores, Large Complex Cores, High Clock Speed (3-5 GHz), Large Cache per Core; GPU: many CUs, Many Simple Cores, Lower Clock Speed (1-2 GHz), Shared Memory across Cores]
\ No newline at end of file diff --git a/docs/data/understand/programming_model/host-device-flow.drawio b/docs/data/understand/programming_model/host-device-flow.drawio new file mode 100644 index 0000000000..2ee8c43ae9 --- /dev/null +++ b/docs/data/understand/programming_model/host-device-flow.drawio @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/understand/programming_model/host-device-flow.svg b/docs/data/understand/programming_model/host-device-flow.svg new file mode 100644 index 0000000000..02bce96c5d --- /dev/null +++ b/docs/data/understand/programming_model/host-device-flow.svg @@ -0,0 +1 @@ +
[SVG text: "Host-Device Data Flow" — Host (CPU) and Device (GPU); 1. Initialize, 2. Transfer Data, 3. Execute Kernel, 4. Return Results]
\ No newline at end of file diff --git a/docs/data/understand/programming_model/memory-access.drawio b/docs/data/understand/programming_model/memory-access.drawio new file mode 100644 index 0000000000..3577772532 --- /dev/null +++ b/docs/data/understand/programming_model/memory-access.drawio @@ -0,0 +1,237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/understand/programming_model/memory-access.svg b/docs/data/understand/programming_model/memory-access.svg new file mode 100644 index 0000000000..5f0dbd8aae --- /dev/null +++ b/docs/data/understand/programming_model/memory-access.svg @@ -0,0 +1 @@ +
[SVG text: "Memory Access Patterns" — Uncoalesced Access (Threads, Memory) versus Coalesced Access (Threads, Memory), with thread and memory indices 0 through 63]
\ No newline at end of file diff --git a/docs/data/understand/programming_model/multi-gpu.drawio b/docs/data/understand/programming_model/multi-gpu.drawio new file mode 100644 index 0000000000..17eca3c318 --- /dev/null +++ b/docs/data/understand/programming_model/multi-gpu.drawio @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/understand/programming_model/multi-gpu.svg b/docs/data/understand/programming_model/multi-gpu.svg new file mode 100644 index 0000000000..190f2593d2 --- /dev/null +++ b/docs/data/understand/programming_model/multi-gpu.svg @@ -0,0 +1 @@ +
[SVG text: "Multi-GPU Workload Distribution" — Host CPU distributing 25% of the workload to each of GPU 0, GPU 1, GPU 2, and GPU 3]
\ No newline at end of file diff --git a/docs/data/understand/programming_model/simt-execution.drawio b/docs/data/understand/programming_model/simt-execution.drawio new file mode 100644 index 0000000000..1e2652f51f --- /dev/null +++ b/docs/data/understand/programming_model/simt-execution.drawio @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/understand/programming_model/simt-execution.svg b/docs/data/understand/programming_model/simt-execution.svg new file mode 100644 index 0000000000..412b9265e7 --- /dev/null +++ b/docs/data/understand/programming_model/simt-execution.svg @@ -0,0 +1 @@ +
[SVG text: "SIMT Execution Model" — a[i] = b[i] + c[i] executed by Thread 0 (b[0]=5, c[0]=3, a[0]=8), Thread 1 (b[1]=2, c[1]=4, a[1]=6), Thread 2 (b[2]=7, c[2]=1, a[2]=8), and Thread 3 (b[3]=3, c[3]=5, a[3]=8)]
\ No newline at end of file diff --git a/docs/data/understand/programming_model/simt.drawio b/docs/data/understand/programming_model/simt.drawio deleted file mode 100644 index 4c5c5a3f26..0000000000 --- a/docs/data/understand/programming_model/simt.drawio +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/docs/data/understand/programming_model/simt.svg b/docs/data/understand/programming_model/simt.svg deleted file mode 100644 index c149ab88e4..0000000000 --- a/docs/data/understand/programming_model/simt.svg +++ /dev/null @@ -1 +0,0 @@ -
[Deleted SVG text from the removed simt.svg figure: two instruction-flow arrows of ADD, DIV, and FMA operations]
\ No newline at end of file diff --git a/docs/data/understand/programming_model/stream-workflow.drawio b/docs/data/understand/programming_model/stream-workflow.drawio new file mode 100644 index 0000000000..616dd28d78 --- /dev/null +++ b/docs/data/understand/programming_model/stream-workflow.drawio @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/understand/programming_model/stream-workflow.svg b/docs/data/understand/programming_model/stream-workflow.svg new file mode 100644 index 0000000000..9648351cad --- /dev/null +++ b/docs/data/understand/programming_model/stream-workflow.svg @@ -0,0 +1 @@ +
[SVG text: "Stream and Event Workflow" — Stream 1, Stream 2, and Stream 3, with Operation and Event markers]
\ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 23f352e306..9be67a91d3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -22,7 +22,6 @@ The HIP documentation is organized into the following categories: :::{grid-item-card} Programming guide -* [Introduction](./programming_guide) * {doc}`./understand/programming_model` * {doc}`./understand/hardware_implementation` * {doc}`./understand/compilers` diff --git a/docs/programming_guide.rst b/docs/programming_guide.rst deleted file mode 100644 index 7444408866..0000000000 --- a/docs/programming_guide.rst +++ /dev/null @@ -1,83 +0,0 @@ -.. meta:: - :description: HIP programming guide introduction - :keywords: HIP programming guide introduction, HIP programming guide - -.. _hip-programming-guide: - -******************************************************************************** -HIP programming guide introduction -******************************************************************************** - -This topic provides key HIP programming concepts and links to more detailed -information. - -Write GPU Kernels for Parallel Execution -================================================================================ - -To make the most of the parallelism inherent to GPUs, a thorough understanding -of the :ref:`programming model ` is helpful. The HIP -programming model is designed to make it easy to map data-parallel algorithms to -architecture of the GPUs. HIP employs the SIMT-model (Single Instruction -Multiple Threads) with a multi-layered thread hierarchy for efficient execution. - -Understand the Target Architecture (CPU and GPU) -================================================================================ - -The :ref:`hardware implementation ` topic outlines the -GPUs supported by HIP. In general, GPUs are made up of Compute Units that excel -at executing parallelizable, computationally intensive workloads without complex -control-flow. - -Increase parallelism on multiple level -================================================================================ - -To maximize performance and keep all system components fully utilized, the -application should expose and efficiently manage as much parallelism as possible. -:ref:`Parallel execution ` can be achieved at the -application, device, and multiprocessor levels. - -The application’s host and device operations can achieve parallel execution -through asynchronous calls, streams, or HIP graphs. On the device level, -multiple kernels can execute concurrently when resources are available, and at -the multiprocessor level, developers can overlap data transfers with -computations to further optimize performance. - -Memory management -================================================================================ - -GPUs generally have their own distinct memory, also called :ref:`device -memory `, separate from the :ref:`host memory `. -Device memory needs to be managed separately from the host memory. This includes -allocating the memory and transfering it between the host and the device. These -operations can be performance critical, so it's important to know how to use -them effectively. For more information, see :ref:`Memory management `. - -Synchronize CPU and GPU Workloads -================================================================================ - -Tasks on the host and devices run asynchronously, so proper synchronization is -needed when dependencies between those tasks exist. 
The asynchronous execution -of tasks is useful for fully utilizing the available resources. Even when only a -single device is available, memory transfers and the execution of tasks can be -overlapped with asynchronous execution. - -Error Handling -================================================================================ - -All functions in the HIP runtime API return an error value of type -:cpp:enum:`hipError_t` that can be used to verify whether the function was -successfully executed. It's important to confirm these returned values, in order -to catch and handle those errors, if possible. An exception is kernel launches, -which don't return any value. These errors can be caught with specific functions -like :cpp:func:`hipGetLastError()`. - -For more information, see :ref:`error_handling` . - -Multi-GPU and Load Balancing -================================================================================ - -Large-scale applications that need more compute power can use multiple GPUs in -the system. This requires distributing workloads across multiple GPUs to balance -the load to prevent GPUs from being overutilized while others are idle. - -For more information, see :ref:`multi-device` . \ No newline at end of file diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 2f08ffcd5a..dacc58d884 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -24,8 +24,6 @@ subtrees: - caption: Programming guide entries: - - file: programming_guide - title: Introduction - file: understand/programming_model - file: understand/hardware_implementation - file: understand/compilers diff --git a/docs/understand/programming_model.rst b/docs/understand/programming_model.rst index 64a92df470..3cac7e374a 100644 --- a/docs/understand/programming_model.rst +++ b/docs/understand/programming_model.rst @@ -7,36 +7,41 @@ .. _programming_model: ******************************************************************************* -Introduction to HIP programming model +Introduction to the HIP programming model ******************************************************************************* -The HIP programming model makes it easy to map data-parallel C/C++ algorithms to -massively parallel, wide single instruction, multiple data (SIMD) architectures, -such as GPUs. HIP supports many imperative languages, such as Python via PyHIP, -but this document focuses on the original C/C++ API of HIP. +The HIP programming model enables mapping data-parallel C/C++ algorithms to massively +parallel SIMD (Single Instruction, Multiple Data) architectures like GPUs. HIP +supports many imperative languages, such as Python via PyHIP, but this document +focuses on the original C/C++ API of HIP. While GPUs may be capable of running applications written for CPUs if properly ported -and compiled, it would not be an efficient use of GPU resources. GPUs are different -from CPUs in fundamental ways, and should be used accordingly to achieve optimum +and compiled, it would not be an efficient use of GPU resources. GPUs fundamentally differ +from CPUs and should be used accordingly to achieve optimum performance. A basic understanding of the underlying device architecture helps you make efficient use of HIP and general purpose graphics processing unit (GPGPU) programming in general. The following topics introduce you to the key concepts of -GPU-based programming, and the HIP programming model. +GPU-based programming and the HIP programming model. 
-Getting into Hardware: CPU vs GPU -================================= +Hardware differences: CPU vs GPU +================================ -CPUs and GPUs have been designed for different purposes. CPUs have been designed -to quickly execute a single thread, decreasing the time it takes for a single -operation, increasing the amount of sequential instructions that can be executed. -This includes fetching data, and reducing pipeline stalls where the ALU has to -wait for previous instructions to finish. +CPUs and GPUs have been designed for different purposes. CPUs quickly execute a single thread, decreasing the time for a single operation while increasing the number of sequential instructions that can be executed. This includes fetching data and reducing pipeline stalls where the ALU has to wait for previous instructions to finish. -On CPUs the goal is to quickly process operations. CPUs provide low latency processing for +.. figure:: ../data/understand/programming_model/cpu-gpu-comparison.svg + :alt: Diagram depicting the differences between CPU and GPU hardware. + The CPU block shows four large processing cores, lists Large Cache per + Core, and High Clock Speed of 3 to 5 gigahertz. The GPU block shows 42 + smaller processing cores, lists Shared Memory across Cores, and Lower + Clock Speeds of 1 to 2 gigahertz. + + Differences in CPUs and GPUs + +With CPUs, the goal is to quickly process operations. CPUs provide low-latency processing for serial instructions. On the other hand, GPUs have been designed to execute many similar commands, or threads, -in parallel, achieving higher throughput. Latency is the delay from when an operation -is started to when it returns, such as 2 ns, while throughput is the number of operations completed -in a period of time, such as ten thousand threads completed. +in parallel, achieving higher throughput. Latency is the time between starting an +operation and receiving its result, such as 2 ns, while throughput is the rate of +completed operations, for example, operations per second. For the GPU, the objective is to process as many operations in parallel, rather than to finish a single instruction quickly. GPUs in general are made up of basic @@ -45,7 +50,7 @@ As described in :ref:`hardware_implementation`, these CUs provide the necessary resources for the threads: the Arithmetic Logical Units (ALUs), register files, caches and shared memory for efficient communication between the threads. -The following defines a few hardware differences between CPUs and GPUs: +The following describes a few hardware differences between CPUs and GPUs: * CPU: @@ -69,8 +74,8 @@ The following defines a few hardware differences between CPUs and GPUs: - Register files are shared among threads. The number of threads that can be run in parallel depends on the registers needed per thread. - Multiple ALUs execute a collection of threads having the same operations, also known as a wavefront or warp. This is called single-instruction, multiple threads (SIMT) operation as described in :ref:`programming_model_simt`. - - The collection of ALUs is called SIMD. SIMDs are an extension to the hardware architecture, that allows a `single instruction` to concurrently operate on `multiple data` inputs. - - For branching threads where conditional instructions lead to thread divergence, ALUs still processes the full wavefront, but the result for divergent threads is masked out. This leads to wasted ALU cycles, and should be a consideration in your programming. 
Keep instructions consistent, and leave conditionals out of threads. + - The collection of ALUs is called SIMD. SIMDs are an extension to the hardware architecture that allows a `single instruction` to concurrently operate on `multiple data` inputs. + - For branching threads where conditional instructions lead to thread divergence, ALUs still process the full wavefront, but the result for divergent threads is masked out. This leads to wasted ALU cycles and should be a consideration in your programming. Keep instructions consistent and leave conditionals out of threads. - The advantage for GPUs is that context switching is easy. All threads that run on a core/compute unit have their registers on the compute unit, so they don't need to be stored to global memory, and each cycle one instruction from any wavefront that resides on the compute unit can be issued. @@ -82,7 +87,7 @@ across large datasets, with little branching, where the volume of operations is .. _heterogeneous_programming: -Heterogeneous Programming +Heterogeneous programming ========================= The HIP programming model has two execution contexts. The main application starts on the CPU, or @@ -127,13 +132,21 @@ In heterogeneous programming, the CPU is available for processing operations but 1. Initialize the HIP runtime and select the GPU: As described in :ref:`initialization`, refers to identifying and selecting a target GPU, setting up a context to let the CPU interact with the GPU. 2. Data preparation: As discussed in :ref:`memory_management`, this includes allocating the required memory on the host and device, preparing input data and transferring it from the host to the device. The data is both transferred to the device, and passed as an input parameter when launching the kernel. -3. Configure and launch the kernel on the GPU: As described in :ref:`device_program`, define and load the kernel or kernels to be run, launch kernels using the triple chevron syntax or appropriate API call (for example ``hipLaunchKernelGGL``), and pass parameters as needed. On the GPU, kernels are run on streams, or a queue of operations. Within the same stream operations run in the order they were issued, but different streams are independent and can execute concurrently. In the HIP runtime, kernels run on the default stream when one is not specified, but specifying a stream for the kernel lets you increase concurrency in task scheduling and resource utilization, and launch and manage multiple kernels from the host program. +3. Configure and launch the kernel on the GPU: As described in :ref:`device_program`, this defines kernel configurations and arguments, launches kernel to run on the GPU device using the triple chevron syntax or appropriate API call (for example ``hipLaunchKernelGGL``). On the GPU, multiple kernels can run on streams, with a queue of operations. Within the same stream, operations run in the order they were issued, but on multiple streams operations are independent and can execute concurrently. In the HIP runtime, kernels run on the default stream when one is not specified, but specifying a stream for the kernel lets you increase concurrency in task scheduling and resource utilization, and launch and manage multiple kernels from the host program. 4. Synchronization: As described in :ref:`asynchronous_how-to`, kernel execution occurs in the context of device streams, specifically the default (`0`) stream. 
You can use streams and events to manage task dependencies, overlap computation with data transfers, and manage asynchronous processes to ensure proper sequencing of operations. Wait for events or streams to finish execution and transfer results from the GPU back to the host. 5. Error handling: As described in :ref:`error_handling`, you should catch and handle potential errors from API calls, kernel launches, or memory operations. For example, use ``hipGetErrorString`` to retrieve error messages. 6. Cleanup and resource management: Validate results, clean up GPU contexts and resources, and free allocated memory on the host and devices. This structure allows for efficient use of GPU resources and facilitates the acceleration of compute-intensive tasks while keeping the host CPU available for other tasks. +.. figure:: ../data/understand/programming_model/host-device-flow.svg + :alt: Diagram depicting a host CPU and device GPU rectangles of varying color. + There are arrows pointing between the rectangles showing from the Host + to the Device the initialization, data transfer, and Kernel execution + steps, and from the Device back to the Host the returning results. + + Interaction of Host and Device in a GPU application + .. _device_program: Device programming @@ -141,30 +154,40 @@ Device programming The device or kernel program acts as workers on the GPU application, distributing operations to be handled quickly and efficiently. Launching a kernel in the host application starts the kernel program running on the GPU, defining the parallel operations to repeat the same instructions across many datasets. Understanding how the kernel works and the processes involved is essential to writing efficient GPU applications. Threads, blocks, and grids provide a hierarchical approach to parallel operations. Understanding the thread hierarchy is critical to distributing work across the available CUs, managing parallel operations, and optimizing memory access. The general flow of the kernel program looks like this: -1. Thread Grouping: As described in :ref:`inherent_thread_model`, threads are organized into a hierarchy consisting of threads which are individual instances of parallel operations, blocks that group the threads together, and grids that group blocks into the kernel. Each thread runs an instance of the kernel in parallel with other threads in the block. +1. Thread Grouping: As described in :ref:`inherent_thread_model`, threads are organized into a hierarchy consisting of threads, which are individual instances of parallel operations, blocks that group the threads, and grids that group blocks into the kernel. Each thread runs an instance of the kernel in parallel with other threads in the block. 2. Indexing: The kernel computes the unique index for each thread to access the relevant data to be processed by the thread. 3. Data Fetch: Threads fetch input data from memory previously transferred from the host to the device. As described in :ref:`memory_hierarchy`, the hierarchy of threads is influenced by the memory subsystem of GPUs. The memory hierarchy includes local memory per-thread with very fast access, shared memory for the block of threads which also supports quick access, and larger amounts of global memory visible to the whole kernel,but accesses are expensive due to high latency. Understanding the memory model is a key concept for kernel programming. 4. Computation: Threads perform the required computations on the input data, and generate any needed output. 
Each thread of the kernel runs the same instruction simultaneously on the different datasets. This sometimes require multiple iterations when the number of operations exceeds the resources of the CU. 5. Synchronization: When needed, threads synchronize within their block to ensure correct results when working with shared memory. -Kernels can be simple single instruction programs deployed across multiple threads in wavefronts, as described below and as demonstrated in the `Hello World tutorial `_ or :doc:`../tutorial/saxpy`. However, heterogeneous GPU applications can also become quite complex, managing hundreds, thousands, or hundreds of thousands of operations with repeated data transfers between host and device to support massive parallelization, using multiple streams to manage concurrent asynchronous operations, using rich libraries of functions optimized for GPU hardware as described in the `ROCm documentation `_. +Kernels are parallel programs that execute the same instruction set across multiple threads, organized in wavefronts, as described below and as demonstrated in the `Hello World tutorial `_ or :doc:`../tutorial/saxpy`. However, heterogeneous GPU applications can also become quite complex, managing hundreds, thousands, or hundreds of thousands of operations with repeated data transfers between host and device to support massive parallelization, using multiple streams to manage concurrent asynchronous operations, using rich libraries of functions optimized for GPU hardware as described in the `ROCm documentation `_. .. _programming_model_simt: Single instruction multiple threads (SIMT) ========================================== -The HIP kernel code, which is written as a series of scalar instructions for multiple +The HIP kernel code, written as a series of scalar instructions for multiple threads with different thread indices, gets mapped to the SIMD units of the GPUs. Every single instruction, which is executed for every participating thread of a kernel, gets mapped to the SIMD. This is done by grouping threads into warps, which contain as many threads as there are physical lanes in a SIMD, and issuing that instruction to the SIMD for every -warp of a kernel. Ideally the SIMD is always fully utilized, however if the number of threads +warp of a kernel. Ideally, the SIMD is always fully utilized. However, if the number of threads can't be evenly divided by the warpSize, then the unused lanes are masked out from the corresponding SIMD execution. +.. _simt: + +.. figure:: ../data/understand/programming_model/simt-execution.svg + :alt: Diagram depicting the SIMT execution model. There is a red rectangle + which contains the expression a[i] = b[i] + c[i], and below that four + arrows that point to Thread 0,1,2, and 3. Each thread contains different + values for b, c, and a, showing the parallel operations of this equation. + + Instruction flow of a sample SIMT program + A kernel follows the same C++ rules as the functions on the host, but it has a special ``__global__`` label to mark it for execution on the device, as shown in the following example: .. code-block:: cpp @@ -188,7 +211,7 @@ called the triple chevron syntax, which looks like the following: AddKernel<<>>(a, b); -Inside the angle brackets you provide the following: +Inside the angle brackets, provide the following: * The number of blocks to launch, which defines the grid size (relating to blockDim). * The number of threads in a block, which defines the block size (relating to blockIdx). 
@@ -198,7 +221,7 @@ Inside the angle brackets you provide the following: .. note:: The kernel can also be launched through other methods, such as the ``hipLaunchKernel()`` function. -Here the total number of threads launched for the ``AddKernel`` program is defined by +Here, the total number of threads launched for the ``AddKernel`` program is defined by ``number_of_blocks * threads_per_block``. You define these values when launching the kernel program to address the problem to be solved with the available resources within the system. In other words, the thread configuration is customized to the needs of the @@ -217,16 +240,6 @@ execution, one thread down each lane of an SIMD engine. Execution parallelism usually isn't exploited from the width of the built-in vector types, but across multiple threads via the thread ID constants ``threadIdx.x``, ``blockIdx.x``, etc. -.. _simt: - -.. figure:: ../data/understand/programming_model/simt.svg - :alt: Image representing the instruction flow of a SIMT program. Two identical - arrows pointing downward with blocks representing the instructions - inside and ellipsis between the arrows. The instructions represented in - the arrows are, from top to bottom: ADD, DIV, FMA, FMA, FMA and FMA. - - Instruction flow of a sample SIMT program. - .. _inherent_thread_model: Hierarchical thread model @@ -239,7 +252,7 @@ blocks, and grids. * Threads are single instances of kernel operations, running concurrently across warps * Blocks group threads together and enable cooperation and shared memory * Grids define the number of thread blocks for a single kernel launch -* Blocks, and grids can be defined in 3 dimensions (``x``, ``y``, ``z``) +* Blocks and grids can be defined in 3 dimensions (``x``, ``y``, ``z``) * By default, the Y and Z dimensions are set to 1 The combined values represent the thread index, and relate to the sequence that the @@ -303,20 +316,19 @@ Grid The three-dimensional thread hierarchy available to a kernel program lends itself to solutions that align closely to the computational problem. The following are some examples: -* 1 dimensional: array processing, linear data structures, or sequential data transformation -* 2 dimensional: Image processing, matrix operations, 2 dimensional simulations -* 3 dimensions: Volume rendering, 3D scientific simulations, spatial algorithms +* 1-dimensional: array processing, linear data structures, or sequential data transformation +* 2-dimensional: Image processing, matrix operations, 2 dimensional simulations +* 3-dimensional: Volume rendering, 3D scientific simulations, spatial algorithms Cooperative groups thread model ------------------------------- The Cooperative groups API introduces new functions to launch, group, subdivide, synchronize and identify threads, as well as some predefined group-collective -algorithms. Most importantly it offers a matching thread model to think of the -cooperative groups in terms of. It relaxes some restrictions of the :ref:`inherent_thread_model` -imposed by the strict 1:1 mapping of architectural details to the programming model. -Cooperative groups let you define your own set of thread groups which may better -fit your use-case than the defaults defined by the hardware. +algorithms. Cooperative groups let you define your own set of thread groups which +may fit your use-cases better than those defined by the hardware. It relaxes some +restrictions of the :ref:`inherent_thread_model` imposed by the strict 1:1 mapping +of architectural details to the programming model. .. 
note:: The implicit groups defined by kernel launch parameters are still available @@ -329,10 +341,12 @@ For further information, see :doc:`Cooperative groups `, an important concept in resource usage and performance optimization. + Use local memory when the data is specific to a thread, to store variables generated + by the thread, or to provide register pressure relief for the thread. + Shared memory - Read-write storage visible to all the threads in a given block. + Read-write storage visible to all the threads in a given block. Use shared memory + when the data is reused within a thread block, when cross-thread communication + is needed, or to minimize global memory transactions by using device memory + whenever possible. Global Read-write storage visible to all threads in a given grid. There are specialized versions of global memory with different usage semantics which - are typically backed by the same hardware storing global. + are typically backed by the same hardware storing global. + + Use global memory when you have large datasets, are transferring memory between + the host and the device, and when you are sharing data between thread blocks. Constant Read-only storage visible to all threads in a given grid. It is a limited - segment of global with queryable size. + segment of global with queryable size. Use constant memory for read-only data + that is shared across multiple threads, and that has a small data size. Texture Read-only storage visible to all threads in a given grid and accessible @@ -371,104 +395,86 @@ Global Surface A read-write version of texture memory. -Using different memory types ----------------------------- - -* Use global memory when: - - - You are transferring data from the host to the device - - You have large data sets, and latency isn't an issue - - You are sharing data between thread blocks +Memory optimizations and best practices +--------------------------------------- -* Use shared memory when: +.. figure:: ../data/understand/programming_model/memory-access.svg + :alt: Diagram depicting an example memory access pattern for coalesced memory. + The diagram has uncoalesced access on the left side, with consecutive + threads accessing memory in a random pattern. With coalesced access on the + right showing consecutive threads accessing consecutive memory addresses. - - The data is reused within a thread block - - Cross-thread communication is needed - - To reduce global memory bandwidth + Coalesced memory accesses -* Use local memory when: +The following are a few memory access patterns and best practices to improve performance. You can find additional information in :ref:`memory_management` and :doc:`../how-to/performance_guidelines`. - - The data is specific to a thread - - To store automatic variables for the thread - - To provide register pressure relief for the thread +* **Global memory**: Coalescing reduces the number of memory transactions. -* Use constant memory when: + Coalesced memory access in HIP refers to the optimization of memory transactions to maximize throughput when accessing global memory. When a kernel accesses global memory, the memory transactions typically occur in chunks of 32, 64, or 128 bytes, which must be naturally aligned. Coalescing memory accesses means aligning and organizing these accesses so that multiple threads in a warp can combine their memory requests into the fewest possible transactions. 
If threads access memory in a coalesced manner, meaning consecutive threads read or write consecutive memory locations, the memory controller can merge these accesses into a single transaction. This is crucial because global memory bandwidth is relatively low compared to on-chip bandwidths, and non-optimal memory accesses can significantly impact performance. If all the threads in a warp can access consecutive memory locations, memory access is fully coalesced. - - The data is read-only - - The same value is used across threads - - The data size is small + To achieve coalesced memory access in HIP, you should: -Memory access patterns and best practices ------------------------------------------ + 1. *Align Data*: Use data types that are naturally aligned and ensure that structures and arrays are aligned properly. + 2. *Optimize Access Patterns*: Arrange memory accesses so that consecutive threads in a warp access consecutive memory locations. For example, if threads access a 2D array, the array and thread block widths should be multiples of the warp size. + 3. *Avoid strided access*: For example array[i * stride] can lead to memory bank conflicts and inefficient access. + 4. *Pad Data*: If necessary, pad data structures to ensure alignment and coalescing. -While you should refer to the :ref:`memory_management`, the following are a few memory -access patterns and best practices: +* **Shared memory**: Avoiding bank conflicts reduces the serialization of memory transactions. -* Global memory: Coalescing reduces memory transactions. -* Shared memory: Avoiding bank conflicts is crucial. -* Texture memory: Spatial locality improves caching. -* Unified memory: Structured access minimizes page migration overhead. + Shared memory is a small, fast memory region inside the CU. Unlike global memory, shared memory accesses do not require coalescing, but they can suffer from bank conflicts, which are another form of inefficient memory access. Shared memory is divided into multiple memory banks (usually 32 banks on modern GPUs). If multiple threads within a warp try to access different addresses that map to the same memory bank, accesses get serialized, leading to poor performance. To optimize shared memory usage, ensure that consecutive threads access different memory banks. Use padding if necessary to avoid conflicts. -When a kernel accesses global memory, the memory transactions typically occur in chunks of 32, 64, or 128 bytes. If threads access memory in a coalesced manner, meaning consecutive threads read or write consecutive memory locations, the memory controller can merge these accesses into a single transaction. Coalesced access primarily applies to global memory, which is the largest but slowest type of memory on a GPU and coalesced access significantly improves performance by reducing memory latency and increasing bandwidth efficiency. +* **Texture memory**: Spatial locality improves caching performance. -To achieve coalesced memory access in HIP, ensure that memory addresses accessed by consecutive threads are aligned. Structure data for coalesced access by storing it in a contiguous manner so that thread[i] can access array[i], and not some random location. Avoid strided access patterns, for example array[i * stride] can lead to memory bank conflicts and inefficient access. If all the threads in a warp can access consecutive memory locations, memory access is fully coalesced. + Texture memory is read-only memory optimized for spatial locality and caching rather than coalescing. 
Texture memory is cached, unlike standard global memory, and it provides optimized access patterns for 2D and spatially local data. Accessing neighboring values results in cache hits, improving performance. Therefore, instead of worrying about coalescing, optimal memory access patterns involve ensuring that threads access spatially adjacent texture elements, and the memory layout aligns well with the 2D caching mechanism. -Shared memory is a small, fast memory region inside the CU. Unlike global memory, shared memory accesses do not require coalescing, but they can suffer from bank conflicts, which are another form of inefficient memory access. Shared memory is divided into multiple memory banks (usually 32 banks on modern GPUs). If multiple threads within a warp try to access different addresses that map to the same memory bank, accesses get serialized, leading to poor performance. To optimize shared memory usage ensure that consecutive threads access different memory banks. Use padding if necessary to avoid conflicts. +* **Unified memory**: Structured access reduces the overhead of page migrations. -Texture memory is read-only memory optimized for spatial locality and caching rather than coalescing. Texture memory is cached, unlike standard global memory, and it provides optimized access patterns for 2D and spatially local data. Accessing neighboring values results in cache hits, improving performance. Therefore, instead of worrying about coalescing, optimal memory access patterns involve ensuring that threads access spatially adjacent texture elements, and the memory layout aligns well with the 2D caching mechanism. + Unified memory allows the CPU and GPU to share memory seamlessly, but performance depends on access patterns. Unified memory enables automatic page migration between CPU and GPU memory. However, if different threads access different pages, it can lead to expensive page migrations and slow throughput performance. Accessing unified memory in a structured, warp-friendly manner reduces unnecessary page transfers. Ensure threads access memory in a structured, consecutive manner, minimizing page faults. Prefetch data to the GPU before computation by using ``hipMemPrefetchAsync()``. In addition, using small batch transfers as described below, can reduce unexpected page migrations when using unified memory. -Unified memory allows the CPU and GPU to share memory seamlessly, but performance depends on access patterns. Unified memory enables automatic page migration between CPU and GPU memory. However, if different threads access different pages, it can lead to expensive page migrations and slow throughput performance. Accessing unified memory in a structured, warp-friendly manner reduces unnecessary page transfers. Ensure threads access memory in a structured, consecutive manner, minimizing page faults. Prefetch data to the GPU before computation by using ``hipMemPrefetchAsync()``. In addition, using small batch transfers as described below, can reduce unexpected page migrations when using unified memory. +* **Small batch transfers**: Enable pipelining and improve PCIe bandwidth use. -Memory transfers between the host and the device can become a major bottleneck if not optimized. One method is to use small batch memory transfers where data is transferred in smaller chunks instead of a dealing with large datasets to avoid long blocking operations. Small batch transfers offer better PCIe bandwidth utilization over large data transfers. 
Small batch transfers offer performance improvement by offering reduced latency with small batches that run asynchronously using ``hipMemcpyAsync()`` as described in :ref:`asynchronous_how-to`, pipelining data transfers and kernel execution using separate streams. Finally, using pinned memory with small batch transfers enables faster DMA transfers without CPU involvement, greatly improving memory transfer performance. + Memory transfers between the host and the device can become a major bottleneck if not optimized. One method is to use small batch memory transfers where data is transferred in smaller chunks instead of dealing with large datasets to avoid long blocking operations. Small batch transfers offer better PCIe bandwidth utilization over large data transfers. Small batch transfers offer performance improvement by offering reduced latency with small batches that run asynchronously using ``hipMemcpyAsync()`` as described in :ref:`asynchronous_how-to`, pipelining data transfers and kernel execution using separate streams. Finally, using pinned memory with small batch transfers enables faster DMA transfers without CPU involvement, greatly improving memory transfer performance. Execution model =============== As previously discussed in :ref:`heterogeneous_programming`, HIP programs consist of two distinct scopes: -* The host-side API running on the host processor. There are two APIs available: - - * The HIP runtime API which enables use of the single-source programming - model. - - * The HIP driver API which sits at a lower level and most importantly differs - by removing some facilities provided by the runtime API, most - importantly around kernel launching and argument setting. It is geared - towards implementing abstractions atop, such as the runtime API itself. - Offers two additional pieces of functionality not provided by the Runtime - API: ``hipModule`` and ``hipCtx`` APIs. For further details, check - :doc:`HIP driver API `. - -* The device-side kernels running on GPUs. Both the host and the device-side - APIs have synchronous and asynchronous functions in them. +* The host-side API running on the host processor. +* The device-side kernels running on GPUs. -.. note:: - - The HIP does not present two *separate* APIs link NVIDIA CUDA. HIP only extends - the HIP runtime API with new APIs for ``hipModule`` and ``hipCtx``. +Both the host and the device-side APIs have synchronous and asynchronous functions. Host-side execution ------------------- -The part of the host-side API which deals with device management and their -queries are synchronous. All asynchronous APIs, such as kernel execution, data -movement and potentially data allocation/freeing all happen in the context of -device streams. +The host-side API dealing with device management and their queries are synchronous. +All asynchronous APIs, such as kernel execution, data movement and potentially data +allocation/freeing all happen in the context of device streams, as described in `Managing streams <../how-to/hip_runtime_api/asynchronous.html#managing-streams>`_. Streams are FIFO buffers of commands to execute relating to a given device. -Commands which enqueue tasks on a stream all return promptly and the command is +Operations that enqueue tasks on a stream all return promptly, and the command is executed asynchronously. All side effects of a command on a stream are visible to all subsequent commands on the same stream. 
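To make these stream semantics concrete, the following is a minimal sketch of enqueuing work on a single stream. The kernel ``scaleKernel``, the buffer size, and the launch configuration are illustrative assumptions; calls such as ``hipStreamCreate()``, ``hipHostMalloc()``, ``hipMemcpyAsync()`` and ``hipStreamSynchronize()`` are standard HIP runtime APIs. Each enqueue returns promptly, and the FIFO ordering of the stream guarantees that the kernel sees the copied input and that the copy back sees the kernel's output (error checking is omitted for brevity).

.. code-block:: cpp

   #include <hip/hip_runtime.h>
   #include <iostream>

   // Illustrative kernel: scales every element in place.
   __global__ void scaleKernel(float* data, int n, float factor)
   {
       int i = blockIdx.x * blockDim.x + threadIdx.x;
       if (i < n) data[i] *= factor;
   }

   int main()
   {
       constexpr int n = 1 << 20;
       float* hostBuf = nullptr;
       float* devBuf = nullptr;

       // Pinned host memory allows the copies below to be truly asynchronous DMA transfers.
       hipHostMalloc(&hostBuf, n * sizeof(float));
       hipMalloc(&devBuf, n * sizeof(float));
       for (int i = 0; i < n; ++i) hostBuf[i] = 1.0f;

       hipStream_t stream;
       hipStreamCreate(&stream);

       // All three commands return promptly; the stream executes them in FIFO order.
       hipMemcpyAsync(devBuf, hostBuf, n * sizeof(float), hipMemcpyHostToDevice, stream);
       scaleKernel<<<(n + 255) / 256, 256, 0, stream>>>(devBuf, n, 2.0f);
       hipMemcpyAsync(hostBuf, devBuf, n * sizeof(float), hipMemcpyDeviceToHost, stream);

       // Block the host until every command enqueued on this stream has completed.
       hipStreamSynchronize(stream);
       std::cout << "first element: " << hostBuf[0] << '\n'; // expected: 2

       hipStreamDestroy(stream);
       hipFree(devBuf);
       hipHostFree(hostBuf);
       return 0;
   }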
Multiple streams may point to the same device and those streams may be fed from multiple concurrent host-side threads. Execution on multiple streams may be concurrent but isn't required to be. -Asynchronous APIs involving a stream all return a stream event which may be +Asynchronous APIs involving a stream all return a stream event, which can be used to synchronize the execution of multiple streams. A user may enqueue a -barrier onto a stream referencing an event. The barrier will block until -the command related to the event does not complete, at which point all -side effects of the command shall be visible to commands following the barrier, -even if those side effects manifest on different devices. +barrier onto a stream referencing an event. The barrier will block activity on the +stream until the operation related to the event completes. After the event completes, all +side effects of the operation will be visible to subsequent commands even if those +side effects manifest on different devices. + +.. figure:: ../data/understand/programming_model/stream-workflow.svg + :alt: Diagram depicting the stream and event workflow, with an example of + multiple streams working together. The diagram shows operations as red + rectangles, and events as white dots. There are three streams labelled + Stream 1, 2, and 3. The streams each have multiple operations and events + that require synchronization between the streams. + + Multiple stream workflow Streams also support executing user-defined functions as callbacks on the host. The stream will not launch subsequent commands until the callback completes. @@ -476,17 +482,8 @@ The stream will not launch subsequent commands until the callback completes. Device-side execution --------------------- -The SIMT programming model behind the HIP device-side execution is a -middle-ground between SMT (Simultaneous Multi-Threading) programming known from -multicore CPUs, and SIMD (Single Instruction, Multiple Data) programming -mostly known from exploiting relevant instruction sets on CPUs (for example -SSE/AVX/Neon). - -Kernel launch -------------- - -Kernels may be launched in multiple ways all with different syntaxes and -intended use-cases. +Kernels may be launched in multiple ways, all with different syntaxes and +intended use cases. * Using the triple-chevron ``<<<...>>>`` operator on a ``__global__`` annotated function. @@ -495,17 +492,44 @@ intended use-cases. .. tip:: - This name by default is a macro expanding to triple-chevron. In cases where + This name, by default, is a macro expanding to the triple-chevron syntax. In cases where language syntax extensions are undesirable, or where launching templated and/or overloaded kernel functions define the ``HIP_TEMPLATE_KERNEL_LAUNCH`` preprocessor macro before including the HIP headers to turn it into a templated function. -* Using the launch APIs supporting the triple-chevron syntax directly. +Asynchronous execution +---------------------- + +Asynchronous operations between the host and the kernel provide a variety of opportunities, +or challenges, for managing synchronization, as described in :ref:`asynchronous_how-to`. +For instance, a basic model would be to launch an asynchronous operation on a kernel +in a stream, create an event to track the operation, continue operations in the host +program, and when the event shows that the asynchronous operation is complete, synchronize the kernel to return the results. 
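A minimal sketch of this basic model follows. The kernel ``computeKernel``, the buffer names, and the polling loop are illustrative assumptions; ``hipEventCreate()``, ``hipEventRecord()``, ``hipEventQuery()`` and ``hipStreamSynchronize()`` are standard HIP APIs. The host keeps doing other work while the device is busy and only retrieves the results after the event reports completion (error checking is omitted for brevity).

.. code-block:: cpp

   #include <hip/hip_runtime.h>

   // Illustrative kernel standing in for longer-running device work.
   __global__ void computeKernel(float* data, int n)
   {
       int i = blockIdx.x * blockDim.x + threadIdx.x;
       if (i < n) data[i] = data[i] * data[i] + 1.0f;
   }

   void runAsync(float* devData, float* hostResult, int n)
   {
       hipStream_t stream;
       hipEvent_t done;
       hipStreamCreate(&stream);
       hipEventCreate(&done);

       // Launch the asynchronous work and record an event directly behind it.
       computeKernel<<<(n + 255) / 256, 256, 0, stream>>>(devData, n);
       hipEventRecord(done, stream);

       // Continue with unrelated host-side work while the device is busy,
       // periodically polling the event to see whether the kernel has finished.
       while (hipEventQuery(done) == hipErrorNotReady) {
           // ... other host work ...
       }

       // The event has completed, so the results can be copied back safely.
       hipMemcpyAsync(hostResult, devData, n * sizeof(float), hipMemcpyDeviceToHost, stream);
       hipStreamSynchronize(stream);

       hipEventDestroy(done);
       hipStreamDestroy(stream);
   }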
+ +However, one of the opportunities of asynchronous operation is the pipelining of operations +between launching kernels and transferring memory. In this case, you would be working +with multiple streams running concurrently, or at least overlapping in some regard, +and managing any dependencies between the streams in the host application. +The producer-consumer paradigm can be used to convert a sequential program +into parallel operations to improve performance. This process can employ multiple +streams to kick off asynchronous kernels, provide data to the kernels, perform operations, +and return the results for further processing in the host application. + +These asynchronous activities call for stream management strategies. In the case +of the single stream, the only management would be the stream synchronization +when the work was complete. However, with multiple streams you have +overlapping execution of operations and synchronization becomes more complex, as shown +in the variations of the example in `Programmatic dependent launch and synchronization <../how-to/hip_runtime_api/asynchronous.html#programmatic-dependent-launch-and-synchronization>`_. +You need to manage each stream's activities, evaluate the availability of results, evaluate the critical path of the tasks, allocate resources on the hardware, and manage the execution order. + +Multi-GPU and load balancing +---------------------------- - .. caution:: +For applications requiring additional computational power beyond a single device, +HIP supports utilizing multiple GPUs within a system. Large-scale applications +that need more compute power can use multiple GPUs in the system. This enables +the runtime to distribute workloads across multiple GPUs to balance the load and prevent some GPUs +from being over-utilized while others are idle. - These APIs are intended to be used/generated by tools such as the HIP - compiler itself and not intended towards end-user code. Should you be - writing a tool having to launch device code using HIP, consider using these - over the alternatives. +For more information, see :ref:`multi-device`. From bccb946ec5399f6649640913dab3c48cae17da9d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 13 Mar 2025 00:37:46 +0000 Subject: [PATCH 17/32] Bump rocm-docs-core[api_reference] from 1.17.1 to 1.18.1 in /docs/sphinx Bumps [rocm-docs-core[api_reference]](https://github.com/ROCm/rocm-docs-core) from 1.17.1 to 1.18.1. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.17.1...v1.18.1) --- updated-dependencies: - dependency-name: rocm-docs-core[api_reference] dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 1d65d55880..07e229101b 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core[api_reference]==1.17.1 +rocm-docs-core[api_reference]==1.18.1 sphinxcontrib.doxylink diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 6db9f3e428..4f3afc36dc 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -211,7 +211,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.17.1 +rocm-docs-core[api-reference]==1.18.1 # via -r requirements.in rpds-py==0.22.3 # via From 5d6c5fb1103d30d1211593a572a5108c745d984f Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Mon, 24 Feb 2025 12:51:47 +0100 Subject: [PATCH 18/32] Docs: Add page for Complex Math API --- .wordlist.txt | 1 + docs/index.md | 1 + docs/reference/complex_math_api.rst | 446 ++++++++++++++++++++++++++++ docs/sphinx/_toc.yml.in | 1 + 4 files changed, 449 insertions(+) create mode 100644 docs/reference/complex_math_api.rst diff --git a/.wordlist.txt b/.wordlist.txt index 7bc0f65fa0..1bca54a941 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -36,6 +36,7 @@ deallocate decompositions denormal Dereferencing +DFT dll DirectX EIGEN diff --git a/docs/index.md b/docs/index.md index 9be67a91d3..8f8717c2ad 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,6 +42,7 @@ The HIP documentation is organized into the following categories: * [HIP runtime API](./reference/hip_runtime_api_reference) * [HIP math API](./reference/math_api) +* [HIP complex math API](./reference/complex_math_api) * [HIP environment variables](./reference/env_variables) * [CUDA to HIP API Function Comparison](./reference/api_syntax) * [List of deprecated APIs](./reference/deprecated_api_list) diff --git a/docs/reference/complex_math_api.rst b/docs/reference/complex_math_api.rst new file mode 100644 index 0000000000..5306efec42 --- /dev/null +++ b/docs/reference/complex_math_api.rst @@ -0,0 +1,446 @@ +.. meta:: + :description: This chapter describes the complex math functions that are accessible in HIP. + :keywords: AMD, ROCm, HIP, CUDA, complex math functions, HIP complex math functions + +.. _complex_math_api_reference: + +******************************************************************************** +HIP complex math API +******************************************************************************** + +HIP provides built-in support for complex number operations through specialized types and functions, +available for both single-precision (float) and double-precision (double) calculations. All complex types +and functions are available on both host and device. + +For any complex number ``z``, the form is: + +.. math:: + + z = x + yi + +where ``x`` is the real part and ``y`` is the imaginary part. + +Complex Number Types +==================== + +A brief overview of the specialized data types used to represent complex numbers in HIP, available +in both single and double precision formats. + +.. 
list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Type + - Description + + * - ``hipFloatComplex`` + - | Complex number using single-precision (float) values + | (note: ``hipComplex`` is an alias of ``hipFloatComplex``) + + * - ``hipDoubleComplex`` + - Complex number using double-precision (double) values + +Complex Number Functions +======================== + +A comprehensive collection of functions for creating and manipulating complex numbers, organized by +functional categories for easy reference. + +Type Construction +----------------- + +Functions for creating complex number objects and extracting their real and imaginary components. + +.. tab-set:: + + .. tab-item:: Single Precision + + .. list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Function + - Description + + * - | ``hipFloatComplex`` + | ``make_hipFloatComplex(`` + | ``float a,`` + | ``float b`` + | ``)`` + - | Creates a complex number + | (note: ``make_hipComplex`` is an alias of ``make_hipFloatComplex``) + | :math:`z = a + bi` + + * - | ``float`` + | ``hipCrealf(`` + | ``hipFloatComplex z`` + | ``)`` + - | Returns real part of z + | :math:`\Re(z) = x` + + * - | ``float`` + | ``hipCimagf(`` + | ``hipFloatComplex z`` + | ``)`` + - | Returns imaginary part of z + | :math:`\Im(z) = y` + + .. tab-item:: Double Precision + + .. list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Function + - Description + + * - | ``hipDoubleComplex`` + | ``make_hipDoubleComplex(`` + | ``double a,`` + | ``double b`` + | ``)`` + - | Creates a complex number + | :math:`z = a + bi` + + * - | ``double`` + | ``hipCreal(`` + | ``hipDoubleComplex z`` + | ``)`` + - | Returns real part of z + | :math:`\Re(z) = x` + + * - | ``double`` + | ``hipCimag(`` + | ``hipDoubleComplex z`` + | ``)`` + - | Returns imaginary part of z + | :math:`\Im(z) = y` + +Basic Arithmetic +---------------- + +Operations for performing standard arithmetic with complex numbers, including addition, +subtraction, multiplication, division, and fused multiply-add. + +.. tab-set:: + + .. tab-item:: Single Precision + + .. list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Function + - Description + + * - | ``hipFloatComplex`` + | ``hipCaddf(`` + | ``hipFloatComplex p,`` + | ``hipFloatComplex q`` + | ``)`` + - | Addition of two single-precision complex values + | :math:`(a + bi) + (c + di) = (a + c) + (b + d)i` + + * - | ``hipFloatComplex`` + | ``hipCsubf(`` + | ``hipFloatComplex p,`` + | ``hipFloatComplex q`` + | ``)`` + - | Subtraction of two single-precision complex values + | :math:`(a + bi) - (c + di) = (a - c) + (b - d)i` + + * - | ``hipFloatComplex`` + | ``hipCmulf(`` + | ``hipFloatComplex p,`` + | ``hipFloatComplex q`` + | ``)`` + - | Multiplication of two single-precision complex values + | :math:`(a + bi)(c + di) = (ac - bd) + (bc + ad)i` + + * - | ``hipFloatComplex`` + | ``hipCdivf(`` + | ``hipFloatComplex p,`` + | ``hipFloatComplex q`` + | ``)`` + - | Division of two single-precision complex values + | :math:`\frac{a + bi}{c + di} = \frac{(ac + bd) + (bc - ad)i}{c^2 + d^2}` + + * - | ``hipFloatComplex`` + | ``hipCfmaf(`` + | ``hipComplex p,`` + | ``hipComplex q,`` + | ``hipComplex r`` + | ``)`` + - | Fused multiply-add of three single-precision complex values + | :math:`(a + bi)(c + di) + (e + fi)` + + .. tab-item:: Double Precision + + .. 
list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Function + - Description + + * - | ``hipDoubleComplex`` + | ``hipCadd(`` + | ``hipDoubleComplex p,`` + | ``hipDoubleComplex q`` + | ``)`` + - | Addition of two double-precision complex values + | :math:`(a + bi) + (c + di) = (a + c) + (b + d)i` + + * - | ``hipDoubleComplex`` + | ``hipCsub(`` + | ``hipDoubleComplex p,`` + | ``hipDoubleComplex q`` + | ``)`` + - | Subtraction of two double-precision complex values + | :math:`(a + bi) - (c + di) = (a - c) + (b - d)i` + + * - | ``hipDoubleComplex`` + | ``hipCmul(`` + | ``hipDoubleComplex p,`` + | ``hipDoubleComplex q`` + | ``)`` + - | Multiplication of two double-precision complex values + | :math:`(a + bi)(c + di) = (ac - bd) + (bc + ad)i` + + * - | ``hipDoubleComplex`` + | ``hipCdiv(`` + | ``hipDoubleComplex p,`` + | ``hipDoubleComplex q`` + | ``)`` + - | Division of two double-precision complex values + | :math:`\frac{a + bi}{c + di} = \frac{(ac + bd) + (bc - ad)i}{c^2 + d^2}` + + * - | ``hipDoubleComplex`` + | ``hipCfma(`` + | ``hipDoubleComplex p,`` + | ``hipDoubleComplex q,`` + | ``hipDoubleComplex r`` + | ``)`` + - | Fused multiply-add of three double-precision complex values + | :math:`(a + bi)(c + di) + (e + fi)` + +Complex Operations +------------------ + +Functions for complex-specific calculations, including conjugate determination and magnitude +(absolute value) computation. + +.. tab-set:: + + .. tab-item:: Single Precision + + .. list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Function + - Description + + * - | ``hipFloatComplex`` + | ``hipConjf(`` + | ``hipFloatComplex z`` + | ``)`` + - | Complex conjugate + | :math:`\overline{a + bi} = a - bi` + + * - | ``float`` + | ``hipCabsf(`` + | ``hipFloatComplex z`` + | ``)`` + - | Absolute value (magnitude) + | :math:`|a + bi| = \sqrt{a^2 + b^2}` + + * - | ``float`` + | ``hipCsqabsf(`` + | ``hipFloatComplex z`` + | ``)`` + - | Squared absolute value + | :math:`|a + bi|^2 = a^2 + b^2` + + .. tab-item:: Double Precision + + .. list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Function + - Description + + * - | ``hipDoubleComplex`` + | ``hipConj(`` + | ``hipDoubleComplex z`` + | ``)`` + - | Complex conjugate + | :math:`\overline{a + bi} = a - bi` + + * - | ``double`` + | ``hipCabs(`` + | ``hipDoubleComplex z`` + | ``)`` + - | Absolute value (magnitude) + | :math:`|a + bi| = \sqrt{a^2 + b^2}` + + * - | ``double`` + | ``hipCsqabs(`` + | ``hipDoubleComplex z`` + | ``)`` + - | Squared absolute value + | :math:`|a + bi|^2 = a^2 + b^2` + +Type Conversion +--------------- + +Utility functions for conversion between single-precision and double-precision complex number formats. + +.. list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Function + - Description + + * - | ``hipFloatComplex`` + | ``hipComplexDoubleToFloat(`` + | ``hipDoubleComplex z`` + | ``)`` + - Converts double-precision to single-precision complex + + * - | ``hipDoubleComplex`` + | ``hipComplexFloatToDouble(`` + | ``hipFloatComplex z`` + | ``)`` + - Converts single-precision to double-precision complex + +Example Usage +============= + +The following example demonstrates using complex numbers to compute the Discrete Fourier Transform (DFT) +of a simple signal on the GPU. The DFT converts a signal from the time domain to the frequency domain. 
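For reference, the transform computed in this example is the standard :math:`N`-point DFT (the formula itself is not HIP-specific; it is restated here only to make the kernel easier to follow):

.. math::

   X[k] = \sum_{n=0}^{N-1} x[n]\, e^{-2 \pi i k n / N}, \qquad k = 0, \ldots, N-1

In the kernel, the complex exponential is built from ``cosf`` and ``sinf`` and the sum is accumulated with the complex arithmetic helpers listed above.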
+The kernel function ``computeDFT`` shows various HIP complex math operations in action:
+
+* Creating complex numbers with ``make_hipFloatComplex``
+* Performing complex multiplication with ``hipCmulf``
+* Accumulating complex values with ``hipCaddf``
+
+The example also demonstrates proper use of complex number handling on both host and device, including
+memory allocation, transfer, and validation of results between CPU and GPU implementations.
+
+.. code-block:: cpp
+
+    #include <hip/hip_runtime.h>
+    #include <hip/hip_complex.h>
+    #include <iostream>
+    #include <vector>
+    #include <cmath>
+
+    #define HIP_CHECK(expression)                  \
+    {                                              \
+        const hipError_t err = expression;         \
+        if (err != hipSuccess) {                   \
+            std::cerr << "HIP error: "             \
+                      << hipGetErrorString(err)    \
+                      << " at " << __LINE__ << "\n"; \
+            exit(EXIT_FAILURE);                    \
+        }                                          \
+    }
+
+    // Kernel to compute DFT
+    __global__ void computeDFT(const float* input,
+                               hipFloatComplex* output,
+                               const int N)
+    {
+        int k = blockIdx.x * blockDim.x + threadIdx.x;
+        if (k >= N) return;
+
+        hipFloatComplex sum = make_hipFloatComplex(0.0f, 0.0f);
+
+        for (int n = 0; n < N; n++) {
+            float angle = -2.0f * M_PI * k * n / N;
+            hipFloatComplex w = make_hipFloatComplex(cosf(angle), sinf(angle));
+            hipFloatComplex x = make_hipFloatComplex(input[n], 0.0f);
+            sum = hipCaddf(sum, hipCmulf(x, w));
+        }
+
+        output[k] = sum;
+    }
+
+    // CPU implementation of DFT for verification
+    std::vector<hipFloatComplex> cpuDFT(const std::vector<float>& input) {
+        const int N = input.size();
+        std::vector<hipFloatComplex> result(N);
+
+        for (int k = 0; k < N; k++) {
+            hipFloatComplex sum = make_hipFloatComplex(0.0f, 0.0f);
+            for (int n = 0; n < N; n++) {
+                float angle = -2.0f * M_PI * k * n / N;
+                hipFloatComplex w = make_hipFloatComplex(cosf(angle), sinf(angle));
+                hipFloatComplex x = make_hipFloatComplex(input[n], 0.0f);
+                sum = hipCaddf(sum, hipCmulf(x, w));
+            }
+            result[k] = sum;
+        }
+        return result;
+    }
+
+    int main() {
+        const int N = 256; // Signal length
+        const int blockSize = 256;
+
+        // Generate input signal: sum of two sine waves
+        std::vector<float> signal(N);
+        for (int i = 0; i < N; i++) {
+            float t = static_cast<float>(i) / N;
+            signal[i] = sinf(2.0f * M_PI * 10.0f * t) + // 10 Hz component
+                        0.5f * sinf(2.0f * M_PI * 20.0f * t); // 20 Hz component
+        }
+
+        // Compute reference solution on CPU
+        std::vector<hipFloatComplex> cpu_output = cpuDFT(signal);
+
+        // Allocate device memory
+        float* d_signal;
+        hipFloatComplex* d_output;
+        HIP_CHECK(hipMalloc(&d_signal, N * sizeof(float)));
+        HIP_CHECK(hipMalloc(&d_output, N * sizeof(hipFloatComplex)));
+
+        // Copy input to device
+        HIP_CHECK(hipMemcpy(d_signal, signal.data(), N * sizeof(float),
+                            hipMemcpyHostToDevice));
+
+        // Launch kernel
+        dim3 grid((N + blockSize - 1) / blockSize);
+        dim3 block(blockSize);
+        computeDFT<<<grid, block>>>(d_signal, d_output, N);
+        HIP_CHECK(hipGetLastError());
+
+        // Get GPU results
+        std::vector<hipFloatComplex> gpu_output(N);
+        HIP_CHECK(hipMemcpy(gpu_output.data(), d_output, N * sizeof(hipFloatComplex),
+                            hipMemcpyDeviceToHost));
+
+        // Verify results
+        bool passed = true;
+        const float tolerance = 1e-5f; // Adjust based on precision requirements
+
+        for (int i = 0; i < N; i++) {
+            float diff_real = std::abs(hipCrealf(gpu_output[i]) - hipCrealf(cpu_output[i]));
+            float diff_imag = std::abs(hipCimagf(gpu_output[i]) - hipCimagf(cpu_output[i]));
+
+            if (diff_real > tolerance || diff_imag > tolerance) {
+                passed = false;
+                break;
+            }
+        }
+
+        std::cout << "DFT Verification: " << (passed ? "PASSED" : "FAILED") << "\n";
+
+        // Cleanup
+        HIP_CHECK(hipFree(d_signal));
+        HIP_CHECK(hipFree(d_output));
+        return passed ?
0 : 1; + } diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index dacc58d884..96e686b738 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -107,6 +107,7 @@ subtrees: - file: doxygen/html/annotated - file: doxygen/html/files - file: reference/math_api + - file: reference/complex_math_api - file: reference/env_variables - file: reference/api_syntax - file: reference/deprecated_api_list From f3f3584f8e3e86ba21981a704c7d59bdcc327f6d Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Mon, 24 Mar 2025 17:17:38 +0100 Subject: [PATCH 19/32] Docs: Add page about HIP error codes --- .../how-to/hip_runtime_api/error_handling.rst | 2 + docs/index.md | 1 + docs/reference/error_codes.rst | 1283 +++++++++++++++++ docs/sphinx/_toc.yml.in | 1 + 4 files changed, 1287 insertions(+) create mode 100644 docs/reference/error_codes.rst diff --git a/docs/how-to/hip_runtime_api/error_handling.rst b/docs/how-to/hip_runtime_api/error_handling.rst index 575f9dee81..a400ff97ec 100644 --- a/docs/how-to/hip_runtime_api/error_handling.rst +++ b/docs/how-to/hip_runtime_api/error_handling.rst @@ -37,6 +37,8 @@ Best practices of HIP error handling: For more details on the error handling functions, see :ref:`error handling functions reference page `. +For a list of all error codes, see :ref:`HIP error codes `. + .. _hip_check_macros: HIP check macros diff --git a/docs/index.md b/docs/index.md index 8f8717c2ad..d47962d6fe 100644 --- a/docs/index.md +++ b/docs/index.md @@ -44,6 +44,7 @@ The HIP documentation is organized into the following categories: * [HIP math API](./reference/math_api) * [HIP complex math API](./reference/complex_math_api) * [HIP environment variables](./reference/env_variables) +* [HIP error codes](./reference/error_codes) * [CUDA to HIP API Function Comparison](./reference/api_syntax) * [List of deprecated APIs](./reference/deprecated_api_list) * [Low Precision Floating Point Types](./reference/low_fp_types) diff --git a/docs/reference/error_codes.rst b/docs/reference/error_codes.rst new file mode 100644 index 0000000000..d46127c44c --- /dev/null +++ b/docs/reference/error_codes.rst @@ -0,0 +1,1283 @@ +.. meta:: + :description: HIP error codes reference + :keywords: AMD, HIP, error codes, error, reference + +.. _hip_error_codes: + +******************************************************************************** +HIP error codes +******************************************************************************** + +This page lists all HIP runtime error codes and their descriptions. These error codes are +returned by HIP API functions to indicate various runtime conditions and errors. + +For more details, see :ref:`Error handling functions `. + +Basic Runtime Errors +==================== + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipSuccess` + - ``0`` + - No error + + * - :term:`hipErrorUnknown` + - ``999`` + - Unknown error + + * - :term:`hipErrorNotReady` + - ``600`` + - Device not ready + + * - :term:`hipErrorIllegalState` + - ``401`` + - The operation cannot be performed in the present state + + * - :term:`hipErrorNotSupported` + - ``801`` + - Operation not supported + + * - :term:`hipErrorTbd` + - ``1054`` + - To be determined/implemented + +.. glossary:: + + hipSuccess + + No error. Operation completed successfully. This is returned when a HIP function completes + without any errors and indicates normal execution. + + hipErrorUnknown + + Unknown error. 
This is a general error code returned when no other error code is applicable or + when the specific error condition cannot be determined. This may indicate an unexpected + internal error in the HIP runtime or driver. + + hipErrorNotReady + + Device not ready. This error occurs when asynchronous operations have not completed. + Common scenarios include: + + * Attempting to access results of an asynchronous operation that is still in progress + * Querying the status of a device that is still processing commands + * Attempting to synchronize with an event that hasn't occurred yet + + hipErrorIllegalState + + The operation cannot be performed in the present state. This error occurs when a valid operation + is attempted at an inappropriate time or when the system is in a state that doesn't allow the + requested action. Common scenarios include: + + * Attempting to modify resources that are in use by an active operation + * Calling functions in an incorrect sequence + * State machine violations in the HIP runtime + * Attempting operations on a device that is in an error state + * Trying to change configurations that can only be set during initialization + * Calling APIs in the wrong order for multi-step operations + + hipErrorNotSupported + + Operation not supported. This error indicates that the requested operation is not supported by the + current hardware, driver, or HIP implementation. + + hipErrorTbd + + To be determined/implemented. This is a placeholder error code for functionality that is planned + but not yet fully implemented. It indicates that: + + * The feature or API may be documented but not fully functional + * The error handling for a particular edge case is not yet defined + * The functionality is under development and will be available in future releases + + If this error is encountered, it generally means the API or feature is not fully supported in the + current version. + +Memory Management Errors +======================== + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorOutOfMemory` + - ``2`` + - Out of memory + + * - :term:`hipErrorInvalidDevicePointer` + - ``17`` + - Invalid device pointer + + * - :term:`hipErrorHostMemoryAlreadyRegistered` + - ``712`` + - Part or all of the requested memory range is already mapped + + * - :term:`hipErrorHostMemoryNotRegistered` + - ``713`` + - Pointer does not correspond to a registered memory region + + * - :term:`hipErrorInvalidMemcpyDirection` + - ``21`` + - Invalid copy direction for memcpy + + * - :term:`hipErrorIllegalAddress` + - ``700`` + - An illegal memory access was encountered + + * - :term:`hipErrorRuntimeMemory` + - ``1052`` + - Runtime memory call returned error + +.. glossary:: + + hipErrorOutOfMemory + + Out of memory. This error occurs when the HIP runtime cannot allocate enough memory to perform the + requested operation. Common scenarios include: + + * Device memory exhaustion during :cpp:func:`hipMalloc()` or similar allocation functions + * Allocating more memory than is available on the device + * Fragmentation of device memory preventing allocation of a contiguous block + * Multiple concurrent allocations exceeding available memory + + hipErrorInvalidDevicePointer + + Invalid device pointer. 
This error occurs when: + + * Using a host pointer where a device pointer is expected + * Using an unallocated device pointer + * Using a device pointer that has been freed + * Using a device pointer from a different context + + hipErrorHostMemoryAlreadyRegistered + + Part or all of the requested memory range is already mapped. This error occurs when attempting to + register host memory that has already been registered. Common scenarios include: + + * Calling :cpp:func:`hipHostRegister()` on a memory region that was previously registered + * Overlapping memory ranges where part of the new range is already registered + * Multiple registration attempts of the same pointer in different parts of the application + * Attempting to register memory that was allocated with :cpp:func:`hipHostMalloc()` (which is already registered) + + This error is distinct from general allocation errors as it specifically deals with the + page-locking/registration of host memory for faster GPU access. + + hipErrorHostMemoryNotRegistered + + Pointer does not correspond to a registered memory region. This error occurs when operations that + require registered host memory are performed on unregistered memory. Common scenarios include: + + * Calling :cpp:func:`hipHostUnregister()` on a pointer that was not previously registered + * Using :cpp:func:`hipHostGetDevicePointer()` on an unregistered host pointer + * Attempting to use :cpp:func:`hipHostGetFlags()` on an unregistered pointer + * Expecting zero-copy behavior with memory that hasn't been properly registered + + This error is the complement to ``hipErrorHostMemoryAlreadyRegistered`` and indicates that an operation + expected registered memory but received a standard host allocation. + + hipErrorInvalidMemcpyDirection + + Invalid copy direction for memcpy. This error occurs when an invalid direction parameter is specified + for memory copy operations. Valid directions include: + + * ``hipMemcpyHostToHost`` + * ``hipMemcpyHostToDevice`` + * ``hipMemcpyDeviceToHost`` + * ``hipMemcpyDeviceToDevice`` + * ``hipMemcpyDefault`` + + The error typically occurs when: + + * Using an undefined direction value + * Using ``hipMemcpyDeviceToDevice`` when copying between incompatible devices + * Using a direction that doesn't match the actual source and destination pointer types + + hipErrorIllegalAddress + + An illegal memory access was encountered. This error indicates that a memory access violation occurred + during kernel execution. Common causes include: + + * Dereferencing a null pointer in device code + * Out-of-bounds access to an array or buffer + * Using an unallocated memory address + * Accessing memory after it has been freed + * Misaligned memory access for types requiring specific alignment + * Writing to read-only memory + * Race conditions in multi-threaded kernels + + This error typically terminates the kernel execution and may provide additional debugging information + when running with GPU debugging tools enabled. + + hipErrorRuntimeMemory + + Runtime memory call returned error. This is a general error indicating that a memory management operation + within the HIP runtime has failed. 
Common scenarios include: + + * Internal memory allocation failures within the HIP runtime + * Memory corruption affecting the runtime's internal data structures + * System-wide memory pressure affecting runtime operations + * Resource limitations preventing memory operations + * Driver-level memory management errors bubbling up to the application + + This error differs from ``hipErrorOutOfMemory`` in that it relates to memory operations internal to the HIP + runtime rather than explicit application requests for memory allocation. + +Device and Context Errors +========================= + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorNoDevice` + - ``100`` + - No ROCm-capable device is detected + + * - :term:`hipErrorInvalidDevice` + - ``101`` + - Invalid device ordinal + + * - :term:`hipErrorInvalidContext` + - ``201`` + - Invalid device context + + * - :term:`hipErrorContextAlreadyCurrent` + - ``202`` + - Context is already current context + + * - :term:`hipErrorContextAlreadyInUse` + - ``216`` + - Exclusive-thread device already in use by a different thread + + * - :term:`hipErrorContextIsDestroyed` + - ``709`` + - Context is destroyed + + * - :term:`hipErrorInvalidHandle` + - ``400`` + - Invalid resource handle + + * - :term:`hipErrorSetOnActiveProcess` + - ``708`` + - Cannot set while device is active in this process + + * - :term:`hipErrorDeinitialized` + - ``4`` + - Driver shutting down + + * - :term:`hipErrorNotInitialized` + - ``3`` + - Initialization error + + * - :term:`hipErrorInsufficientDriver` + - ``35`` + - Driver version is insufficient for runtime version + +.. glossary:: + + hipErrorNoDevice + + No ROCm-capable device is detected. This error occurs when the system does not have any compatible GPU devices + that support the HIP runtime. Common scenarios include: + + * No physical GPU is installed in the system + * Installed GPUs are not supported by the current HIP/ROCm version + * GPU drivers are missing, outdated, or corrupted + * GPU hardware failure or disconnection + * System configuration prevents GPU detection (e.g., BIOS settings, virtualization limitations) + * On Linux with ``HIP_PLATFORM=amd``, insufficient user permissions - the user must belong to both the ``render`` and ``video`` groups + + hipErrorInvalidDevice + + Invalid device ordinal. This error occurs when a function is called with a device index that doesn't correspond to + a valid device. Common scenarios include: + + * Using a device index greater than or equal to the number of available devices + * Using a negative device index + * Using a device that has been removed or disabled + * Attempting to access a device after system configuration changes + + Unlike ``hipErrorNoDevice`` which indicates no devices are available at all, this error occurs when trying to access + a specific invalid device index while other valid devices might still be present. + + hipErrorInvalidContext + + Invalid device context. This error occurs when an operation is attempted with an invalid or destroyed context. 
+ Common scenarios include: + + * Using a context after calling :cpp:func:`hipCtxDestroy()` on it + * Context corruption due to previous errors + * Using a context associated with a device that has been reset + * Mixing contexts improperly between different HIP API calls + * Context handle that was never properly created or initialized + * Using a context from a different process or thread incorrectly + + Context errors often indicate improper resource management in the application or incorrect context handling + in multi-GPU or multi-threaded applications. + + hipErrorContextAlreadyCurrent + + Context is already current context. This error occurs when attempting to make a context current when it is already + the current context for the calling thread. + + hipErrorContextAlreadyInUse + + Exclusive-thread device already in use by a different thread. This error occurs when attempting to access a device or + context that has been allocated in exclusive thread mode from a thread other than the one that created it. + + hipErrorContextIsDestroyed + + Context is destroyed. This error occurs when attempting to use a context that has been previously destroyed. + + hipErrorInvalidHandle + + Invalid resource handle. This error occurs when an invalid handle is provided to a HIP API function. Common scenarios + include using handles that have been destroyed or were never properly initialized. + + hipErrorSetOnActiveProcess + + Cannot set while device is active in this process. This error occurs when attempting to change settings + that cannot be modified while the device is active. + + hipErrorDeinitialized + + Driver shutting down. This error occurs when attempting to use HIP functionality when the driver is in the + process of shutting down or has been deinitialized. Common scenarios include: + + * Using HIP functions after calling :cpp:func:`hipDeviceReset()` + * System is in the process of shutdown or reboot + * Driver crash or unexpected termination + * Another process has triggered driver reset + + hipErrorNotInitialized + + Initialization error. This occurs when attempting to use HIP functionality before the runtime has been + properly initialized. Common scenarios include: + + * Calling HIP API functions before calling :cpp:func:`hipInit()` + * Driver or runtime initialization failure + * System configuration issues preventing proper initialization of the HIP runtime + * Hardware initialization problems + + hipErrorInsufficientDriver + + Driver version is insufficient for runtime version. This error occurs when the installed GPU driver is too + old to support the current HIP runtime version. This version mismatch can cause compatibility issues. + Common scenarios include: + + * Using a newer HIP SDK with older driver installations + * System updates that upgraded the HIP runtime but not the GPU drivers + * Custom build environments with mismatched components + * Partial upgrades of the ROCm stack + +Kernel and Launch Errors +======================== + +.. 
list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorInvalidDeviceFunction` + - ``98`` + - Invalid device function + + * - :term:`hipErrorInvalidConfiguration` + - ``9`` + - Invalid configuration argument + + * - :term:`hipErrorInvalidSymbol` + - ``13`` + - Invalid device symbol + + * - :term:`hipErrorMissingConfiguration` + - ``52`` + - ``__global__`` function call is not configured + + * - :term:`hipErrorNoBinaryForGpu` + - ``209`` + - No kernel image is available for execution on the device + + * - :term:`hipErrorInvalidKernelFile` + - ``218`` + - Invalid kernel file + + * - :term:`hipErrorInvalidImage` + - ``200`` + - Device kernel image is invalid + + * - :term:`hipErrorLaunchFailure` + - ``719`` + - Unspecified launch failure + + * - :term:`hipErrorLaunchTimeOut` + - ``702`` + - The launch timed out and was terminated + + * - :term:`hipErrorLaunchOutOfResources` + - ``701`` + - Too many resources requested for launch + + * - :term:`hipErrorCooperativeLaunchTooLarge` + - ``720`` + - Too many blocks in cooperative launch + + * - :term:`hipErrorPriorLaunchFailure` + - ``53`` + - Unspecified launch failure in prior launch + +.. glossary:: + + hipErrorInvalidDeviceFunction + + Invalid device function. This error occurs when attempting to use a function that is not a valid device + function or is not available for the current device. Common scenarios include: + + * Code compiled for a specific GPU architecture (using ``--offload-arch``) but executed on an different/incompatible GPU + + hipErrorInvalidConfiguration + + Invalid configuration argument. This error occurs when the configuration specified for a kernel launch + or other configurable operation contains invalid parameters. Common scenarios include: + + * Block dimensions exceeding hardware limits (too many threads per block) + * Grid dimensions that are invalid (zero size or exceeding limits) + * Invalid shared memory configuration + * Incompatible combination of launch parameters + * Block dimensions that don't match kernel requirements + * Attempting to use more resources per block than available on the device + + This error typically requires adjusting kernel launch parameters to stay within the limits of the target + device. Device properties and specific hardware constraints can be queried using :cpp:func:`hipGetDeviceProperties()`. + + hipErrorInvalidSymbol + + Invalid device symbol. This error occurs when a referenced symbol (variable or function) cannot be found + or is improperly specified. Common scenarios include: + + * Referencing a symbol that doesn't exist in the compiled kernel + * Symbol name typos or case mismatches + * Attempting to access a host symbol as if it were a device symbol + * Symbol not properly decorated with ``__device__`` or other required attributes + * Symbol not visible due to scope/namespace issues + + hipErrorMissingConfiguration + + ``__global__`` function call is not configured. This error occurs when a kernel launch is attempted + without proper configuration. 
Common scenarios include: + + * Calling a kernel without specifying execution configuration (grid and block dimensions) + * Invalid or incomplete kernel configuration + * Calling a ``__global__`` function directly as if it were a regular CPU function + * Using a function pointer to a ``__global__`` function incorrectly + + This error is specific to improper kernel invocation syntax and is different from general configuration + errors (``hipErrorInvalidConfiguration``) which relate to the values provided in a properly formed + launch configuration. + + hipErrorNoBinaryForGpu + + No kernel image is available for execution on the device. This error occurs when attempting to run a + kernel on a device for which no compatible compiled binary exists. Common scenarios include: + + * Attempting to run code compiled for a different GPU architecture + * Missing or corrupted kernel binary for the target device + * Kernel was compiled without support for the target device architecture + * Using pre-compiled kernels that don't support the installed hardware + * JIT compilation failure during runtime + + hipErrorInvalidKernelFile + + Invalid kernel file. This error occurs when the kernel file or module being loaded is corrupted or in + an invalid format. + + hipErrorInvalidImage + + Device kernel image is invalid. This error occurs when the device code image is corrupted or in an + unsupported format. + + hipErrorLaunchFailure + + Unspecified launch failure. This is a general error that occurs when a kernel launch fails. + Common causes include: + + * Mismatch between block size configuration and block size specified in launch bounds parameter + * Invalid memory access in kernel + * Kernel execution timeout + * Hardware-specific failures + + hipErrorLaunchTimeOut + + The launch timed out and was terminated. This error occurs when a kernel execution exceeds the + system's watchdog timeout limit. Common scenarios include: + + * Infinite loops in kernel code + * Extremely long-running computations exceeding system limits + * Deadlocks in kernel execution + * Complex kernels that legitimately need more time than the watchdog allows + * Hardware or driver issues preventing normal kernel termination + + The GPU's watchdog timer is a safety mechanism to prevent a hanging kernel from making the system + unresponsive. + + hipErrorLaunchOutOfResources + + Too many resources requested for launch. This occurs when kernel resource requirements exceed + device limits, such as: + + * Exceeding maximum threads per block + * Exceeding maximum shared memory per block + * Exceeding maximum register count per thread + * Insufficient hardware resources for parallel execution + + hipErrorCooperativeLaunchTooLarge + + Too many blocks in cooperative launch. This error occurs when a cooperative kernel launch requests + more thread blocks than the device can support for cooperative groups functionality. + Common scenarios include: + + * Launching a cooperative kernel with grid dimensions that exceed hardware limits + * Requesting more resources than available for synchronization across thread blocks + * Using cooperative groups on hardware with limited support + * Not accounting for cooperative launch limitations in kernel configuration + + Cooperative kernels allow thread blocks to synchronize with each other, but this requires special + hardware support with specific limitations on the maximum number of blocks that can participate + in synchronization operations. 
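      As a minimal sketch (the kernel symbol ``coopKernel`` and ``blockSize`` are illustrative assumptions; the occupancy and device-property calls are standard HIP APIs), the grid of a cooperative launch can be bounded before launching:

      .. code-block:: cpp

         int blocksPerCU = 0;
         // Ask how many blocks of this kernel can be resident per compute unit.
         hipOccupancyMaxActiveBlocksPerMultiprocessor(
             &blocksPerCU, reinterpret_cast<const void*>(coopKernel), blockSize, 0);

         hipDeviceProp_t prop;
         hipGetDeviceProperties(&prop, 0);

         // A cooperative launch must fit on the device all at once; keeping
         // gridDim.x <= maxCoopBlocks avoids hipErrorCooperativeLaunchTooLarge.
         int maxCoopBlocks = blocksPerCU * prop.multiProcessorCount;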
+ + hipErrorPriorLaunchFailure + + Unspecified launch failure in prior launch. This error indicates that a previous kernel launch failed + and affected the current HIP context state. Common scenarios include: + + * Launching a new kernel after a previous kernel crashed without resetting the device + * Context contamination from previous failed operations + * Resource leaks from previous launches affecting current operations + * Attempting to use results from a previous failed kernel execution + + When this error occurs, it may be necessary to reset the device or create a new context to continue + normal operation. Additional debugging of the previous failed launch may be required to identify + the root cause. + +Stream Capture Errors +===================== + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorStreamCaptureUnsupported` + - ``900`` + - Operation not permitted when stream is capturing + + * - :term:`hipErrorStreamCaptureInvalidated` + - ``901`` + - Operation failed due to a previous error during capture + + * - :term:`hipErrorStreamCaptureMerge` + - ``902`` + - Operation would result in a merge of separate capture sequences + + * - :term:`hipErrorStreamCaptureUnmatched` + - ``903`` + - Capture was not ended in the same stream as it began + + * - :term:`hipErrorStreamCaptureUnjoined` + - ``904`` + - Capturing stream has unjoined work + + * - :term:`hipErrorStreamCaptureIsolation` + - ``905`` + - Dependency created on uncaptured work in another stream + + * - :term:`hipErrorStreamCaptureImplicit` + - ``906`` + - Operation would make the legacy stream depend on a capturing blocking stream + + * - :term:`hipErrorStreamCaptureWrongThread` + - ``908`` + - Attempt to terminate a thread-local capture sequence from another thread + + * - :term:`hipErrorCapturedEvent` + - ``907`` + - Operation not permitted on an event last recorded in a capturing stream + +.. glossary:: + + hipErrorStreamCaptureUnsupported + + Operation not permitted when stream is capturing. This error occurs when attempting to perform an + operation that is incompatible with stream capture mode. Common scenarios include: + + * Calling synchronization functions like :cpp:func:`hipDeviceSynchronize()` during capture + * Using operations that implicitly synchronize during stream capture + * Attempting to use features that cannot be captured as part of a graph + * Trying to perform operations on different devices during capture + * Using driver APIs that are incompatible with the stream capture mechanism + + Stream capture is used to record operations for later replay as a graph. Certain operations that + affect global state or rely on host-device synchronization cannot be properly captured in this + execution model. + + hipErrorStreamCaptureInvalidated + + Operation failed due to a previous error during capture. This error occurs when a stream capture + has been invalidated by a prior error but capture operations are still being attempted. + Common scenarios include: + + * Continuing to add operations to a stream after a capture-invalidating error + * Not checking return codes from previous capture operations + * Attempting to end a capture after invalidation + * System or resource conditions changing during capture + + Once a stream capture has been invalidated, the entire capture sequence should be aborted and + restarted from the beginning after resolving the cause of the initial failure. 
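      A minimal sketch of defensive capture handling is shown below (the stream, buffers, and sizes are illustrative assumptions; the capture and graph calls are standard HIP APIs):

      .. code-block:: cpp

         hipGraph_t graph = nullptr;
         hipError_t status = hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal);

         if (status == hipSuccess) {
             // Check every captured operation so an invalidated capture is noticed early.
             status = hipMemcpyAsync(dDst, dSrc, bytes, hipMemcpyDeviceToDevice, stream);
         }

         // hipStreamEndCapture() must still be called to leave capture mode; it fails
         // if the sequence was invalidated along the way.
         hipError_t endStatus = hipStreamEndCapture(stream, &graph);
         if (status != hipSuccess || endStatus != hipSuccess) {
             if (graph != nullptr) {
                 hipGraphDestroy(graph); // discard the partial capture and restart from scratch
             }
         }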
+ + hipErrorStreamCaptureMerge + + Operation would result in a merge of separate capture sequences. This error occurs when an operation + would cause independent capture sequences to merge, which is not supported. Common scenarios include: + + * A stream that is being captured interacting with another capturing stream + * Operations creating implicit dependencies between separate capture sequences + * Using events or other synchronization primitives that would link separate captures + * Resource sharing between different capture sequences + + Stream captures must remain independent of each other to be converted into separate executable graphs. + Operations that would create dependencies between separate captures are not allowed. + + hipErrorStreamCaptureUnmatched + + Capture was not ended in the same stream as it began. This error occurs when trying to end a stream + capture in a different stream than the one where it was started. Common scenarios include: + + * Calling :cpp:func:`hipStreamEndCapture()` on a different stream than :cpp:func:`hipStreamBeginCapture()` + * Confusing stream handles in multi-stream applications + * Not properly tracking which streams have active captures + * Programming errors in capture sequence management + + Stream captures must begin and end in the same stream to maintain the integrity of the captured + operation sequence. The same stream handle must be used for beginning and ending a capture sequence. + + hipErrorStreamCaptureUnjoined + + Capturing stream has unjoined work. This error occurs when attempting to end a stream capture + when there are still pending operations from other streams that have not been joined back to + the capturing stream. Common scenarios include: + + * Forgetting to properly join forked work before ending capture + * Missing :cpp:func:`hipEventRecord()` / :cpp:func:`hipStreamWaitEvent()` pairs for joined streams + * Complex stream dependencies that are not fully resolved at capture end + * Attempting to end a capture before all child operations complete + + When a stream capture forks work to other streams, those operations must be explicitly joined + back to the capturing stream before the capture can be ended. This ensures that all dependencies + are properly represented in the resulting graph. + + hipErrorStreamCaptureIsolation + + Dependency created on uncaptured work in another stream. This error occurs when a capturing stream + becomes dependent on operations in a non-capturing stream. Common scenarios include: + + * A capturing stream waiting on an event recorded in a non-capturing stream + * Creating dependencies on the default stream or other streams outside the capture + * Using synchronization primitives that create implicit dependencies + * Operations that depend on host-side or uncaptured GPU work + + Stream capture requires that all dependencies be explicitly captured as part of the graph. Operations + that would make the captured sequence dependent on work outside the capture cannot be represented + in the resulting graph and are therefore not allowed. + + hipErrorStreamCaptureImplicit + + Operation would make the legacy stream depend on a capturing blocking stream. This error occurs when + an operation would create a dependency from the default (legacy/null) stream to a stream that is + being captured in blocking mode. 
Common scenarios include: + + * Using the default stream during capture in ways that would create dependencies + * Operations that would cause implicit synchronization with the null stream + * Mixing legacy stream synchronization behavior with stream capture + * Not properly managing stream relationships in applications using both explicit streams and the + default stream + + This error is related to the implicit synchronization behavior of the default stream in HIP, + which can conflict with the explicit dependency tracking needed for stream capture. + + hipErrorStreamCaptureWrongThread + + Attempt to terminate a thread-local capture sequence from another thread. This error occurs when + a thread tries to end a stream capture that was begun by a different thread when using + thread-local capture mode. Common scenarios include: + + * Multi-threaded applications incorrectly managing stream capture + * Attempting to end a capture from a different thread than the one that started it + * Thread pool or worker thread designs that don't properly track capture ownership + * Misunderstanding the thread locality requirements of certain capture modes + + When using ``hipStreamCaptureModeThreadLocal``, stream captures are associated with the specific + thread that started them and can only be ended by that same thread. + + hipErrorCapturedEvent + + Operation not permitted on an event last recorded in a capturing stream. This error occurs + when attempting to perform operations on an event that was last recorded in a stream that + is being captured. Common scenarios include: + + * Calling :cpp:func:`hipEventQuery()` or :cpp:func:`hipEventSynchronize()` on an event recorded during capture + * Using events for host synchronization that are part of a stream capture + * Attempting to reuse events across capturing and non-capturing contexts + * Mixing event usage between graph capture and immediate execution modes + + Events that are part of a stream capture sequence are handled differently than regular events + and cannot be used for host-side synchronization until the capture is complete and the graph + is executed. + +Profiler Errors +=============== + +.. warning:: + + The HIP Profiler Control APIs (:cpp:func:`hipProfilerStart()`, :cpp:func:`hipProfilerStop()`) are deprecated. + It is recommended to use the ROCm profiling tools such as rocprof, roctracer, or AMD Radeon GPU Profiler + for performance analysis instead. + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorProfilerDisabled` + - ``5`` + - Profiler disabled while using external profiling tool + + * - :term:`hipErrorProfilerNotInitialized` + - ``6`` + - Profiler is not initialized + + * - :term:`hipErrorProfilerAlreadyStarted` + - ``7`` + - Profiler already started + + * - :term:`hipErrorProfilerAlreadyStopped` + - ``8`` + - Profiler already stopped + +.. glossary:: + + hipErrorProfilerDisabled + + Profiler disabled while using external profiling tool. This error occurs when attempting to use + the built-in HIP profiling functionality while an external profiling tool has taken control of + the profiling interface. 
Common scenarios include: + + * Using :cpp:func:`hipProfilerStart()` / :cpp:func:`hipProfilerStop()` while running under tools like rocprof + or AMD Radeon GPU Profiler + * Conflicting profiling requests from different parts of an application + * Attempting to use the HIP profiling API when profiling has been disabled at the driver level + * Environment configurations that disable internal profiling in favor of external tools + + When external performance analysis tools are in use, they typically take exclusive control of + the profiling interface, preventing the application from using the built-in profiling functions. + + hipErrorProfilerNotInitialized + + Profiler is not initialized. This error occurs when attempting to use profiling functions before the + profiler has been properly initialized. Common scenarios include: + + * Calling :cpp:func:`hipProfilerStop()` without first calling :cpp:func:`hipProfilerStart()` + * Using profiling functions before the HIP runtime has fully initialized + * Configuration issues preventing proper profiler initialization + * Missing required profiler components or drivers + + The HIP profiler requires proper initialization before it can collect performance data. The + :cpp:func:`hipProfilerStart()` function must be called successfully before using other profiling functions + or attempting to collect profile data. + + hipErrorProfilerAlreadyStarted + + Profiler already started. This error occurs when attempting to start the HIP profiler when it + has already been started. Common scenarios include: + + * Multiple calls to :cpp:func:`hipProfilerStart()` without intervening :cpp:func:`hipProfilerStop()` + * Attempting to restart profiling in different parts of code without coordination + * Nested profiling sections that don't properly track profiler state + * Mismanagement of profiler state in complex applications + + The HIP profiler can only be started once and must be stopped before it can be started again. + This error is informational and indicates that the profiler is already in the desired active + state. + + hipErrorProfilerAlreadyStopped + + Profiler already stopped. This error occurs when attempting to stop the HIP profiler when it is + not currently running. Common scenarios include: + + * Calling :cpp:func:`hipProfilerStop()` multiple times without intervening :cpp:func:`hipProfilerStart()` + * Mismanagement of profiler state in code with multiple profiling sections + * Attempting to stop profiling in error handling paths when it wasn't started + * Improper profiler state tracking in complex applications + + The HIP profiler must be in an active state before it can be stopped. This error is informational + and indicates that the profiler is already in the desired inactive state. + +Resource Mapping Errors +======================= + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorMapFailed` + - ``205`` + - Mapping of buffer object failed + + * - :term:`hipErrorUnmapFailed` + - ``206`` + - Unmapping of buffer object failed + + * - :term:`hipErrorArrayIsMapped` + - ``207`` + - Array is mapped + + * - :term:`hipErrorAlreadyMapped` + - ``208`` + - Resource already mapped + + * - :term:`hipErrorNotMapped` + - ``211`` + - Resource not mapped + + * - :term:`hipErrorNotMappedAsArray` + - ``212`` + - Resource not mapped as array + + * - :term:`hipErrorNotMappedAsPointer` + - ``213`` + - Resource not mapped as pointer + +.. 
glossary:: + + hipErrorMapFailed + + Mapping of buffer object failed. This error occurs when the system fails to map device memory to + host-accessible memory space. Common scenarios include: + + * Insufficient system resources for mapping + * Attempting to map too much memory simultaneously + * Mapping memory that is in an invalid state (e.g., already mapped or in use) + * Trying to map memory with incompatible access flags or properties + * System-level memory mapping constraints or limitations + * Attempting to map special memory types that don't support mapping + * Memory pressure affecting the operating system's ability to establish mappings + + This error typically occurs with functions like :cpp:func:`hipHostRegister()`, :cpp:func:`hipGLMapBufferObject()`, + or similar functions that attempt to make device memory accessible to the host through memory + mapping mechanisms. + + hipErrorUnmapFailed + + Unmapping of buffer object failed. This error occurs when the system fails to unmap previously + mapped memory. Common scenarios include: + + * Attempting to unmap memory that is not currently mapped + * Resources being in use by an active operation + * System or driver issues affecting memory management + * Invalid handle or pointer provided to unmap function + * Corrupted mapping state due to application errors + * Operating system resource constraints or failures + + This error is the counterpart to ``hipErrorMapFailed`` and occurs during cleanup operations when + releasing mappings between host and device memory spaces. It may indicate resource leaks or + state inconsistencies if not properly handled. + + hipErrorArrayIsMapped + + Array is mapped. This error occurs when attempting an operation that is not permitted on a + mapped array or buffer. Common scenarios include: + + * Trying to free or modify a mapped array + * Performing certain operations that require exclusive access to mapped resources + * Attempting to re-map an already mapped array + * Using mapped arrays in ways that conflict with their current mapped state + * API calls that are incompatible with the current mapping state + + Arrays or buffers that are currently mapped to host memory have certain restrictions on the + operations that can be performed on them. They must be unmapped before certain operations + are allowed. + + hipErrorAlreadyMapped + + Resource already mapped. This error occurs when attempting to map a resource that is already + in a mapped state. Common scenarios include: + + * Calling mapping functions multiple times on the same resource + * Improper tracking of resource mapping state in complex applications + * Race conditions in multi-threaded applications accessing the same resources + * Attempting to map a resource with different flags when it's already mapped + + This error is similar to ``hipErrorArrayIsMapped`` but is more general and can apply to various + mappable resources, not just arrays. Resources must be unmapped before they can be mapped + again, possibly with different properties. + + hipErrorNotMapped + + Resource not mapped. This error occurs when attempting to perform an operation that requires + a resource to be in a mapped state, but the resource is not currently mapped. 
+ Common scenarios include: + + * Trying to unmap a resource that is not mapped + * Attempting to access host pointers for unmapped resources + * Using mapping-dependent functions on unmapped resources + * Mismanaging mapping state in complex applications + * Attempting to use mapping-specific features with resources that don't support mapping + + This error indicates that a resource must be explicitly mapped before certain operations + can be performed on it. + + hipErrorNotMappedAsArray + + Resource not mapped as array. This error occurs when attempting to use a mapped resource + as an array when it was not mapped with the appropriate array mapping type. Common scenarios include: + + * Attempting to use a resource as an array when it was mapped with a different mapping type + * Using :cpp:func:`hipArrayGetInfo()` or similar functions on resources not mapped as arrays + * Type confusion in complex applications using multiple mapping types + * Mismatched mapping and usage patterns for shared resources + + Different mapping types provide access to resources in different ways, and operations specific + to one mapping type cannot be used with resources mapped using a different type. This error + specifically indicates that an array-specific operation was attempted on a resource that was + not mapped as an array. + + hipErrorNotMappedAsPointer + + Resource not mapped as pointer. This error occurs when attempting to use a mapped resource as + a pointer when it was not mapped with the appropriate pointer mapping type. Common scenarios include: + + * Attempting to use a resource as a pointer when it was mapped with a different mapping type + * Trying to perform pointer arithmetic or pointer-based access on inappropriately mapped resources + * Type confusion in complex applications using multiple mapping types + * Mismatched mapping and usage patterns for shared resources + + This error is complementary to ``hipErrorNotMappedAsArray`` and indicates that a pointer-specific + operation was attempted on a resource that was not mapped as a pointer. Resources must be mapped + with the appropriate mapping type for the operations that will be performed on them. + +Peer Access Errors +================== + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorPeerAccessUnsupported` + - ``217`` + - Peer access is not supported between these two devices + + * - :term:`hipErrorPeerAccessAlreadyEnabled` + - ``704`` + - Peer access is already enabled + + * - :term:`hipErrorPeerAccessNotEnabled` + - ``705`` + - Peer access has not been enabled + +.. glossary:: + + hipErrorPeerAccessUnsupported + + Peer access is not supported between these two devices. This error occurs when attempting to enable peer + access between devices that cannot physically support direct access to each other's memory. + Common scenarios include: + + * Devices connected to different PCIe root complexes without required hardware support + * Different types or generations of GPUs that are incompatible for peer access + * System configurations (BIOS, chipset) that don't allow peer-to-peer transfers + * Virtualized environments that restrict direct hardware access + * Attempting peer access on systems where the hardware interconnect doesn't support it + + This error indicates a hardware or system limitation, not an application error. To work around it, + use regular host-mediated memory transfers instead of direct peer access. 
Device compatibility should + be verified with :cpp:func:`hipDeviceCanAccessPeer()` before enabling peer access. + + hipErrorPeerAccessAlreadyEnabled + + Peer access is already enabled. This error occurs when attempting to enable peer access between two + devices when that access has already been enabled. Common scenarios include: + + * Multiple calls to :cpp:func:`hipDeviceEnablePeerAccess()` for the same device pair + * Enabling peer access in different parts of code without tracking the current state + * Attempting to re-enable peer access after a context change without checking status + + This error is informational and typically doesn't indicate a problem that needs to be fixed, + but rather that the requested state is already in effect. + + hipErrorPeerAccessNotEnabled + + Peer access has not been enabled. This error occurs when operations requiring peer access between + devices are attempted without first enabling that access. Common scenarios include: + + * Attempting peer-to-peer memory copies without calling :cpp:func:`hipDeviceEnablePeerAccess()` + * Kernel launches that access memory on peer devices without proper access rights + * Accessing peer memory after peer access has been disabled + + To fix this error, call :cpp:func:`hipDeviceEnablePeerAccess()` before attempting operations that require direct + access between peer devices. Not all device combinations support peer access. Compatibility can be + determined with :cpp:func:`hipDeviceCanAccessPeer()`. + +System and File Errors +====================== + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorFileNotFound` + - ``301`` + - File not found + + * - :term:`hipErrorSharedObjectSymbolNotFound` + - ``302`` + - Shared object symbol not found + + * - :term:`hipErrorSharedObjectInitFailed` + - ``303`` + - Shared object initialization failed + + * - :term:`hipErrorOperatingSystem` + - ``304`` + - OS call failed or operation not supported on this OS + + * - :term:`hipErrorNotFound` + - ``500`` + - Named symbol not found + + * - :term:`hipErrorRuntimeOther` + - ``1053`` + - Runtime call other than memory returned error + +.. glossary:: + + hipErrorFileNotFound + + File not found. This error occurs when HIP attempts to load a file that doesn't exist in the + specified location. Common scenarios include: + + * Missing kernel source or binary files + * Incorrect file paths provided to API functions + * Missing shared libraries or dependencies + * Files deleted or moved after initial configuration + * Permission issues preventing file access + + This error typically occurs with operations like loading external kernels, modules, or shared + libraries required by HIP applications. + + hipErrorSharedObjectSymbolNotFound + + Shared object symbol not found. This error occurs when attempting to access a symbol in a shared + library or module that doesn't exist or isn't exported. Common scenarios include: + + * Misspelled symbol names + * Using symbols that exist in the source code but weren't exported in the compiled library + * Versioning mismatches between headers and implementation + * Mangled C++ symbol names not properly accounted for + * Library compiled with different visibility settings than expected + * Using a function or variable name that exists but is in a different namespace + + This error is commonly encountered when using :cpp:func:`hipModuleGetFunction()` or similar functions to obtain + handles to functions in dynamically loaded modules. 
+ + hipErrorSharedObjectInitFailed + + Shared object initialization failed. This error occurs when a shared library or module fails during + its initialization routine. Common scenarios include: + + * Dependencies of the shared object are missing + * Incompatible library versions + * Library initialization code encountering errors + * Resource allocation failures during initialization + * Incompatible compilation settings between application and shared object + * Issues with static constructors in C++ libraries + + This error indicates that while the shared object was found and could be loaded, something prevented + its proper initialization, making its functions and resources unavailable for use. + + hipErrorOperatingSystem + + OS call failed or operation not supported on this OS. This error indicates a system-level failure + outside of the HIP runtime's direct control. Common scenarios include: + + * Insufficient permissions for requested operations + * OS resource limits reached (file descriptors, memory limits, etc.) + * System calls returning failure codes + * Attempting operations not supported by the current OS or OS version + * Driver or hardware interactions failing at the OS level + * File system errors or permission issues + + This is a general error that can occur when HIP interacts with the operating system and encounters + problems that prevent successful completion of the requested operation. + + hipErrorNotFound + + Named symbol not found. This error is returned when a requested named entity (such as a symbol, + texture, surface, etc.) cannot be found. Common scenarios include: + + * Referencing a kernel function that doesn't exist in the module + * Looking up a texture that hasn't been bound or created + * Searching for a device with specific properties that no installed device has + * Referencing a stream or event that has been destroyed + * Using a name for a resource that was never created + * Typos in symbol names + + This error is similar to ``hipErrorSharedObjectSymbolNotFound`` but is more general and applies to + various named entities beyond just symbols in shared objects. + + hipErrorRuntimeOther + + Runtime call other than memory returned error. This is a general error code for failures in the + HIP runtime that don't fit into other more specific categories. Common scenarios include: + + * Internal runtime function failures + * Unexpected conditions encountered during HIP API execution + * Driver-level errors not covered by more specific error codes + * Hardware interaction issues + * State inconsistencies within the runtime + + This is a catch-all error that may require looking at system logs or using additional + debugging tools to identify the root cause. + +Graphics Context Errors +======================= + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorInvalidGraphicsContext` + - ``219`` + - Invalid OpenGL or DirectX context + + * - :term:`hipErrorGraphExecUpdateFailure` + - ``910`` + - | The graph update was not performed because it included changes which violated + | constraints specific to instantiated graph update + +.. glossary:: + + hipErrorInvalidGraphicsContext + + Invalid OpenGL or DirectX context. This error occurs when attempting to perform interoperability + operations with an invalid or incompatible graphics context. 
+ + hipErrorGraphExecUpdateFailure + + The graph update was not performed because it included changes which violated constraints specific to + instantiated graph update. This error occurs when attempting to update an already instantiated + graph with changes that are not allowed. + +Hardware Errors +=============== + +.. list-table:: + :header-rows: 1 + :widths: 30 10 60 + + * - Error Code + - Value + - Description + + * - :term:`hipErrorECCNotCorrectable` + - ``214`` + - Uncorrectable ECC error encountered + + * - :term:`hipErrorUnsupportedLimit` + - ``215`` + - Limit is not supported on this architecture + + * - :term:`hipErrorAssert` + - ``710`` + - Device-side assert triggered + +.. glossary:: + + hipErrorECCNotCorrectable + + Uncorrectable ECC error encountered. This hardware-level error occurs when the GPU's + Error-Correcting Code (ECC) mechanism detects memory corruption that cannot be automatically + corrected. Common scenarios include: + + * Physical hardware failure or degradation in GPU memory + * Overheating causing memory bit flips + * Running at extreme overclocked settings + * Aging hardware with declining reliability + * Power supply issues affecting memory integrity + + When this error occurs, the affected memory contents are unreliable and the operation cannot + continue safely. This error generally requires system intervention, and in persistent cases, + may indicate hardware that needs replacement. + + hipErrorUnsupportedLimit + + Limit is not supported on this architecture. This error occurs when attempting to query or + set a device limit that is not supported by the current hardware. Common scenarios include: + + * Using :cpp:func:`hipDeviceSetLimit()` with a limit type not supported by the hardware + * Requesting advanced features on entry-level or older GPU hardware + * Setting limits specific to one GPU architecture on a different architecture + * Using limit types introduced in newer HIP versions with older hardware + + This error indicates a hardware capability limitation rather than an application error. + + hipErrorAssert + + Device-side assert triggered. This error occurs when an assertion inside GPU kernel code + fails. Common scenarios include: + + * Explicit :cpp:func:`assert()` statement in device code evaluates to false + * Debug checks added by developers that detect invalid conditions + * Parameter validation in kernel code that failed + * Detected algorithmic errors or unexpected conditions + + This error is particularly useful for debugging as it explicitly indicates where a + programmer-defined condition was violated in device code. 
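
As a brief illustration of how the status codes documented above are typically consumed on the
host side, the following minimal sketch wraps HIP runtime calls in an error-checking helper and
shows one case where a specific, merely informational code (``hipErrorPeerAccessAlreadyEnabled``,
see the peer access section above) is tolerated rather than treated as fatal. The ``HIP_CHECK``
macro name and the two-device scenario are illustrative assumptions, not part of the HIP API or of
any particular HIP sample; only the runtime functions themselves (:cpp:func:`hipGetErrorName`,
:cpp:func:`hipGetErrorString`, :cpp:func:`hipDeviceCanAccessPeer`, :cpp:func:`hipDeviceEnablePeerAccess`)
are standard.

.. code-block:: cpp

   #include <hip/hip_runtime.h>
   #include <cstdio>
   #include <cstdlib>

   // Abort with the symbolic name and description of any unexpected status code.
   #define HIP_CHECK(expr)                                                      \
       do {                                                                     \
           hipError_t status_ = (expr);                                         \
           if (status_ != hipSuccess) {                                         \
               std::fprintf(stderr, "HIP error %s: %s (%s:%d)\n",               \
                            hipGetErrorName(status_),                           \
                            hipGetErrorString(status_), __FILE__, __LINE__);    \
               std::exit(EXIT_FAILURE);                                         \
           }                                                                    \
       } while (0)

   int main() {
       int deviceCount = 0;
       HIP_CHECK(hipGetDeviceCount(&deviceCount));

       if (deviceCount > 1) {
           int canAccessPeer = 0;
           HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
           if (canAccessPeer) {
               HIP_CHECK(hipSetDevice(0));
               // hipErrorPeerAccessAlreadyEnabled only reports that the requested
               // state is already in effect, so it is not treated as fatal here.
               hipError_t status = hipDeviceEnablePeerAccess(1, 0);
               if (status != hipSuccess && status != hipErrorPeerAccessAlreadyEnabled) {
                   HIP_CHECK(status);
               }
           }
       }
       return 0;
   }

Which codes an application chooses to tolerate depends on the calling context; the informational
profiler codes (``hipErrorProfilerAlreadyStarted``, ``hipErrorProfilerAlreadyStopped``) described
earlier are handled in the same way. The point of the sketch is only that the names and
descriptions in this reference map directly onto values returned by every HIP runtime call.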
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 96e686b738..35ed57f0b6 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -109,6 +109,7 @@ subtrees: - file: reference/math_api - file: reference/complex_math_api - file: reference/env_variables + - file: reference/error_codes - file: reference/api_syntax - file: reference/deprecated_api_list title: List of deprecated APIs From 21e296c65bfaa21273e67b3bf471d40ed6438b6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Fri, 14 Mar 2025 09:41:24 +0100 Subject: [PATCH 20/32] Update docs: the compilation cache is enabled by default --- docs/how-to/hip_rtc.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/how-to/hip_rtc.rst b/docs/how-to/hip_rtc.rst index 734bf60284..223e11081c 100644 --- a/docs/how-to/hip_rtc.rst +++ b/docs/how-to/hip_rtc.rst @@ -265,17 +265,17 @@ Use the following environment variables to manage the cache status as enabled or disabled, the location for storing the cache contents, and the cache eviction policy: -* ``AMD_COMGR_CACHE`` By default this variable has a value of ``0`` and the - compilation cache feature is disabled. To enable the feature set the - environment variable to a value of ``1`` (or any value other than ``0``). +* ``AMD_COMGR_CACHE`` By default this variable is unset and the + compilation cache feature is enabled. To disable the feature set the + environment variable to a value of ``0``. * ``AMD_COMGR_CACHE_DIR``: By default the value of this environment variable is - defined as ``$XDG_CACHE_HOME/comgr_cache``, which defaults to - ``$USER/.cache/comgr_cache`` on Linux, and ``%LOCALAPPDATA%\cache\comgr_cache`` + defined as ``$XDG_CACHE_HOME/comgr``, which defaults to + ``$USER/.cache/comgr`` on Linux, and ``%LOCALAPPDATA%\cache\comgr`` on Windows. You can specify a different directory for the environment variable to change the path for cache storage. If the runtime fails to access the - specified cache directory, or the environment variable is set to an empty - string (""), the cache is disabled. + specified cache directory the cache is disabled. If the environment variable + is set to an empty string (``""``), the default directory is used. * ``AMD_COMGR_CACHE_POLICY``: If assigned a value, the string is interpreted and applied to the cache pruning policy. The string format is consistent with From 6adac615d0e1fc06af45eac3faf3448e107b04a8 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Wed, 26 Mar 2025 18:06:16 +0100 Subject: [PATCH 21/32] Fix fns32 function mask type in doc --- docs/reference/math_api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/math_api.rst b/docs/reference/math_api.rst index 504054ff91..06902e2290 100644 --- a/docs/reference/math_api.rst +++ b/docs/reference/math_api.rst @@ -1483,7 +1483,7 @@ since they only apply to floating-point operations, not integer arithmetic. | Returns the position of the first set bit in a 64 bit signed integer. | Note: if ``x`` is ``0``, will return ``0`` - * - | ``unsigned int __fns32(unsigned long long mask, unsigned int base, int offset)`` + * - | ``unsigned int __fns32(unsigned int mask, unsigned int base, int offset)`` | Find the position of the n-th set to 1 bit in a 32-bit integer. 
| Note: this intrinsic is emulated via software, so performance can be potentially slower From 1bdf6e6a909466959ef86ac15acc681e67ddb55b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 3 Apr 2025 00:36:16 +0000 Subject: [PATCH 22/32] Bump rocm-docs-core[api_reference] from 1.18.1 to 1.18.2 in /docs/sphinx Bumps [rocm-docs-core[api_reference]](https://github.com/ROCm/rocm-docs-core) from 1.18.1 to 1.18.2. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.18.1...v1.18.2) --- updated-dependencies: - dependency-name: rocm-docs-core[api_reference] dependency-version: 1.18.2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 07e229101b..14d231de12 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core[api_reference]==1.18.1 +rocm-docs-core[api_reference]==1.18.2 sphinxcontrib.doxylink diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 4f3afc36dc..ada67dda8a 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -211,7 +211,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.18.1 +rocm-docs-core[api-reference]==1.18.2 # via -r requirements.in rpds-py==0.22.3 # via From db37c56af7e7fa46c3ab6de69d7211b09e256bb8 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Tue, 8 Apr 2025 15:40:11 +0200 Subject: [PATCH 23/32] Fix readme link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ed6db9581b..57ff69619b 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); The HIP kernel language defines builtins for determining grid and block coordinates, math functions, short vectors, atomics, and timer functions. -It also specifies additional defines and keywords for function types, address spaces, and optimization controls (See the [HIP C++ Language Extensions](docs/reference/cpp_language_extensions.rst) for a full description). +It also specifies additional defines and keywords for function types, address spaces, and optimization controls (See the [HIP C++ Language Extensions](docs/how-to/hip_cpp_language_extensions.rst) for a full description). Here's an example of defining a simple 'vector_square' kernel. ```cpp From 4617c98f57a73673769e3c285a73718fac56ba70 Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Mon, 7 Apr 2025 16:16:58 +0200 Subject: [PATCH 24/32] Docs: Fix verbose paths generated by doxygen --- docs/doxygen/Doxyfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 6570128d00..fb4eaae2de 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -170,7 +170,8 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. 
-STRIP_FROM_PATH = +STRIP_FROM_PATH = ../../ \ + ../../../ # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which From 1a4f35505b015d63d833ec48c23eeb9fe5003ecb Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Thu, 10 Apr 2025 16:41:47 +0200 Subject: [PATCH 25/32] Fix CU and WGP mode effect on warpSize There is no effect on warpSize --- docs/how-to/hip_cpp_language_extensions.rst | 4 +--- docs/reference/hardware_features.rst | 10 ++++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/how-to/hip_cpp_language_extensions.rst b/docs/how-to/hip_cpp_language_extensions.rst index 4798b1d9c1..2cde72cfdc 100644 --- a/docs/how-to/hip_cpp_language_extensions.rst +++ b/docs/how-to/hip_cpp_language_extensions.rst @@ -411,9 +411,7 @@ warpSize ================================================================================ The ``warpSize`` constant contains the number of threads per warp for the given -target device. It can differ between different architectures, and on RDNA -architectures it can even differ between kernel launches, depending on whether -they run in CU or WGP mode. See the +target device. It can differ between different architectures, see the :doc:`hardware features <../reference/hardware_features>` for more information. diff --git a/docs/reference/hardware_features.rst b/docs/reference/hardware_features.rst index f5e227fc78..5bf3a74b81 100644 --- a/docs/reference/hardware_features.rst +++ b/docs/reference/hardware_features.rst @@ -240,10 +240,12 @@ page. - 106 - 104 -.. [1] RDNA architectures have a configurable wavefront size. The native - wavefront size is 32, but they can run in "CU mode", which has an effective - wavefront size of 64. This affects the number of resident wavefronts and - blocks per compute Unit. +.. [1] The RDNA architectures feature an experimental compiler option called + ``mwavefrontsize64``, which determines the wavefront size for kernel code + generation. When this option is disabled, the native wavefront size of 32 is + used, when enabled wavefront size 64 is used. This option is not supported by + the HIP runtime. + .. [2] RDNA architectures expand the concept of the traditional compute unit with the so-called work group processor, which effectively includes two compute units, within which all threads can cooperate. From e1b72a34ae732215c03ef6edc91a4a59f5c234dd Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Mon, 14 Apr 2025 12:57:15 +0200 Subject: [PATCH 26/32] Use single example for installation tests --- docs/install/build.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/install/build.rst b/docs/install/build.rst index 64deba241b..49cf9adca6 100644 --- a/docs/install/build.rst +++ b/docs/install/build.rst @@ -238,4 +238,5 @@ Run HIP ================================================= After installation and building HIP, you can compile your application and run. -Simple examples can be found in the `ROCm-examples repository `_. 
+A simple SAXPY example can be found in the `ROCm-examples repository `_ +and the guide on how to build and run it is in the :doc:`SAXPY tutorial <../tutorial/saxpy>` From 4c301d72adf9ebbcbb9fb10c2f14bc9b7a273a28 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Tue, 15 Apr 2025 14:23:13 +0200 Subject: [PATCH 27/32] Update install and build instructions --- docs/install/build.rst | 254 ++++++++++++++++++++------------------- docs/install/install.rst | 112 ++++++++--------- 2 files changed, 187 insertions(+), 179 deletions(-) diff --git a/docs/install/build.rst b/docs/install/build.rst index 49cf9adca6..76903a81b9 100644 --- a/docs/install/build.rst +++ b/docs/install/build.rst @@ -9,27 +9,28 @@ Build HIP from source Prerequisites ================================================= -HIP code can be developed either on AMD ROCm platform using HIP-Clang compiler, or a CUDA platform with ``nvcc`` installed. -Before building and running HIP, make sure drivers and prebuilt packages are installed properly on the platform. +HIP code can be developed either on AMD ROCm platform using HIP-Clang compiler, +or a CUDA platform with ``nvcc`` installed. Before building and running HIP, +make sure drivers and prebuilt packages are installed properly on the platform. You also need to install Python 3, which includes the ``CppHeaderParser`` package. Install Python 3 using the following command: .. code-block:: shell - apt-get install python3 + apt-get install python3 Check and install ``CppHeaderParser`` package using the command: .. code-block:: shell - pip3 install CppHeaderParser + pip3 install CppHeaderParser Install ``ROCm LLVM`` package using the command: .. code-block:: shell - apt-get install rocm-llvm-dev + apt-get install rocm-llvm-dev .. _Building the HIP runtime: @@ -41,197 +42,200 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for .. code-block:: shell - export ROCM_BRANCH=rocm-6.1.x + export ROCM_BRANCH=rocm-6.1.x .. tab-set:: - .. tab-item:: AMD - :sync: amd + .. tab-item:: AMD + :sync: amd - #. Get HIP source code. + #. Get HIP source code. - .. note:: - Starting in ROCM 5.6, CLR is a new repository that includes the former ROCclr, HIPAMD and - OpenCl repositories. OpenCL provides headers that ROCclr runtime depends on. + .. note:: + + Starting in ROCM 5.6, CLR is a new repository that includes the former ROCclr, HIPAMD and + OpenCl repositories. OpenCL provides headers that ROCclr runtime depends on. - .. note:: - Starting in ROCM 6.1, a new repository ``hipother`` is added to ROCm, which is branched out from HIP. - ``hipother`` provides files required to support the HIP back-end implementation on some non-AMD platforms, - like NVIDIA. + .. note:: - .. code-block:: shell + Starting in ROCM 6.1, a new repository ``hipother`` is added to ROCm, which is branched out from HIP. + ``hipother`` provides files required to support the HIP back-end implementation on some non-AMD platforms, + like NVIDIA. - git clone -b "$ROCM_BRANCH" https://github.com/ROCm/clr.git - git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hip.git + .. code-block:: shell - CLR (Compute Language Runtime) repository includes ROCclr, HIPAMD and OpenCL. + git clone -b "$ROCM_BRANCH" https://github.com/ROCm/clr.git + git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hip.git - ROCclr (ROCm Compute Language Runtime) is a virtual device interface which - is defined on the AMD platform. HIP runtime uses ROCclr to interact with different backends. 
+ CLR (Compute Language Runtime) repository includes ROCclr, HIPAMD and OpenCL. - HIPAMD provides implementation specifically for HIP on the AMD platform. + ROCclr (ROCm Compute Language Runtime) is a virtual device interface which + is defined on the AMD platform. HIP runtime uses ROCclr to interact with different backends. - OpenCL provides headers that ROCclr runtime currently depends on. - hipother provides headers and implementation specifically for non-AMD HIP platforms, like NVIDIA. + HIPAMD provides implementation specifically for HIP on the AMD platform. - #. Set the environment variables. + OpenCL provides headers that ROCclr runtime currently depends on. + hipother provides headers and implementation specifically for non-AMD HIP platforms, like NVIDIA. - .. code-block:: shell + #. Set the environment variables. - export CLR_DIR="$(readlink -f clr)" - export HIP_DIR="$(readlink -f hip)" + .. code-block:: shell + export CLR_DIR="$(readlink -f clr)" + export HIP_DIR="$(readlink -f hip)" - #. Build HIP. - .. code-block:: shell + #. Build HIP. - cd "$CLR_DIR" - mkdir -p build; cd build - cmake -DHIP_COMMON_DIR=$HIP_DIR -DHIP_PLATFORM=amd -DCMAKE_PREFIX_PATH="/opt/rocm/" -DCMAKE_INSTALL_PREFIX=$PWD/install -DHIP_CATCH_TEST=0 -DCLR_BUILD_HIP=ON -DCLR_BUILD_OCL=OFF .. + .. code-block:: shell - make -j$(nproc) - sudo make install + cd "$CLR_DIR" + mkdir -p build; cd build + cmake -DHIP_COMMON_DIR=$HIP_DIR -DHIP_PLATFORM=amd -DCMAKE_PREFIX_PATH="/opt/rocm/" -DCMAKE_INSTALL_PREFIX=$PWD/install -DHIP_CATCH_TEST=0 -DCLR_BUILD_HIP=ON -DCLR_BUILD_OCL=OFF .. - .. note:: + make -j$(nproc) + sudo make install - Note, if you don't specify ``CMAKE_INSTALL_PREFIX``, the HIP runtime is installed at - ````. + .. note:: - By default, release version of HIP is built. If need debug version, you can put the option ``CMAKE_BUILD_TYPE=Debug`` in the command line. + Note, if you don't specify ``CMAKE_INSTALL_PREFIX``, the HIP runtime is installed at + ````. - Default paths and environment variables: + By default, release version of HIP is built. If need debug version, you can + put the option ``CMAKE_BUILD_TYPE=Debug`` in the command line. - * HIP is installed into ````. This can be overridden by setting the ``INSTALL_PREFIX`` as the command option. - environment variable. - * HSA is in ````. This can be overridden by setting the ``HSA_PATH`` - environment variable. - * Clang is in ``/llvm/bin``. This can be overridden by setting the - ``HIP_CLANG_PATH`` environment variable. - * The device library is in ``/lib``. This can be overridden by setting the - ``DEVICE_LIB_PATH`` environment variable. - * Optionally, you can add ``/bin`` to your ``PATH``, which can make it easier to - use the tools. - * Optionally, you can set ``HIPCC_VERBOSE=7`` to output the command line for compilation. + Default paths and environment variables: - After you run the ``make install`` command, HIP is installed to ```` by default, or ``$PWD/install/hip`` while ``INSTALL_PREFIX`` is defined. + * HIP is installed into ````. This can be overridden by setting the ``INSTALL_PREFIX`` as the command option. + + * HSA is in ````. This can be overridden by setting the ``HSA_PATH`` environment variable. + + * Clang is in ``/llvm/bin``. This can be overridden by setting the ``HIP_CLANG_PATH`` environment variable. + + * The device library is in ``/lib``. This can be overridden by setting the ``DEVICE_LIB_PATH`` environment variable. + + * Optionally, you can add ``/bin`` to your ``PATH``, which can make it easier to use the tools. 
+ + * Optionally, you can set ``HIPCC_VERBOSE=7`` to output the command line for compilation. - #. Generate a profiling header after adding/changing a HIP API. + After you run the ``make install`` command, HIP is installed to ```` by default, or ``$PWD/install/hip`` while ``INSTALL_PREFIX`` is defined. - When you add or change a HIP API, you may need to generate a new ``hip_prof_str.h`` header. - This header is used by ROCm tools to track HIP APIs, such as ``rocprofiler`` and ``roctracer``. + #. Generate a profiling header after adding/changing a HIP API. - To generate the header after your change, use the ``hip_prof_gen.py`` tool located in - ``hipamd/src``. + When you add or change a HIP API, you may need to generate a new ``hip_prof_str.h`` header. + This header is used by ROCm tools to track HIP APIs, such as ``rocprofiler`` and ``roctracer``. - Usage: + To generate the header after your change, use the ``hip_prof_gen.py`` tool located in + ``hipamd/src``. - .. code-block:: shell + Usage: - `hip_prof_gen.py [-v] []` + .. code-block:: shell - Flags: + `hip_prof_gen.py [-v] []` - * ``-v``: Verbose messages - * ``-r``: Process source directory recursively - * ``-t``: API types matching check - * ``--priv``: Private API check - * ``-e``: On error exit mode - * ``-p``: ``HIP_INIT_API`` macro patching mode + Flags: - Example usage: + * ``-v``: Verbose messages + * ``-r``: Process source directory recursively + * ``-t``: API types matching check + * ``--priv``: Private API check + * ``-e``: On error exit mode + * ``-p``: ``HIP_INIT_API`` macro patching mode - .. code-block:: shell + Example usage: - hip_prof_gen.py -v -p -t --priv /include/hip/hip_runtime_api.h \ - /src /include/hip/amd_detail/hip_prof_str.h \ - /include/hip/amd_detail/hip_prof_str.h.new + .. code-block:: shell - .. tab-item:: NVIDIA - :sync: nvidia + hip_prof_gen.py -v -p -t --priv /include/hip/hip_runtime_api.h \ + /src /include/hip/amd_detail/hip_prof_str.h \ + /include/hip/amd_detail/hip_prof_str.h.new - #. Get the HIP source code. + .. tab-item:: NVIDIA + :sync: nvidia - .. code-block:: shell + #. Get the HIP source code. - git clone -b "$ROCM_BRANCH" https://github.com/ROCm/clr.git - git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hip.git - git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hipother.git + .. code-block:: shell - #. Set the environment variables. + git clone -b "$ROCM_BRANCH" https://github.com/ROCm/clr.git + git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hip.git + git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hipother.git - .. code-block:: shell + #. Set the environment variables. - export CLR_DIR="$(readlink -f clr)" - export HIP_DIR="$(readlink -f hip)" - export HIP_OTHER="$(readlink -f hipother)" + .. code-block:: shell - #. Build HIP. + export CLR_DIR="$(readlink -f clr)" + export HIP_DIR="$(readlink -f hip)" + export HIP_OTHER="$(readlink -f hipother)" - .. code-block:: shell + #. Build HIP. - cd "$CLR_DIR" - mkdir -p build; cd build - cmake -DHIP_COMMON_DIR=$HIP_DIR -DHIP_PLATFORM=nvidia -DCMAKE_INSTALL_PREFIX=$PWD/install -DHIP_CATCH_TEST=0 -DCLR_BUILD_HIP=ON -DCLR_BUILD_OCL=OFF -DHIPNV_DIR=$HIP_OTHER/hipnv .. - make -j$(nproc) - sudo make install + .. code-block:: shell + + cd "$CLR_DIR" + mkdir -p build; cd build + cmake -DHIP_COMMON_DIR=$HIP_DIR -DHIP_PLATFORM=nvidia -DCMAKE_INSTALL_PREFIX=$PWD/install -DHIP_CATCH_TEST=0 -DCLR_BUILD_HIP=ON -DCLR_BUILD_OCL=OFF -DHIPNV_DIR=$HIP_OTHER/hipnv .. 
+ make -j$(nproc) + sudo make install Build HIP tests ================================================= .. tab-set:: - .. tab-item:: AMD - :sync: amd + .. tab-item:: AMD + :sync: amd - * Build HIP catch tests. + **Build HIP catch tests.** - HIP catch tests are separate from the HIP project and use Catch2. + HIP catch tests are separate from the HIP project and use Catch2. - * Get HIP tests source code. + #. Get HIP tests source code. - .. code-block:: shell + .. code-block:: shell - git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hip-tests.git + git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hip-tests.git - * Build HIP tests from source. + #. Build HIP tests from source. - .. code-block:: shell + .. code-block:: shell - export HIPTESTS_DIR="$(readlink -f hip-tests)" - cd "$HIPTESTS_DIR" - mkdir -p build; cd build - cmake ../catch -DHIP_PLATFORM=amd -DHIP_PATH=$CLR_DIR/build/install # or any path where HIP is installed; for example: ``/opt/rocm`` - make build_tests - ctest # run tests + export HIPTESTS_DIR="$(readlink -f hip-tests)" + cd "$HIPTESTS_DIR" + mkdir -p build; cd build + cmake ../catch -DHIP_PLATFORM=amd -DHIP_PATH=$CLR_DIR/build/install # or any path where HIP is installed; for example: ``/opt/rocm`` + make build_tests + ctest # run tests - HIP catch tests are built in ``$HIPTESTS_DIR/build``. + HIP catch tests are built in ``$HIPTESTS_DIR/build``. - To run any single catch test, use this example: + To run any single catch test, use this example: - .. code-block:: shell + .. code-block:: shell - cd $HIPTESTS_DIR/build/catch_tests/unit/texture - ./TextureTest + cd $HIPTESTS_DIR/build/catch_tests/unit/texture + ./TextureTest - * Build a HIP Catch2 standalone test. + #. Build a HIP Catch2 standalone test. (Optional) - .. code-block:: shell + .. code-block:: shell - cd "$HIPTESTS_DIR" - hipcc $HIPTESTS_DIR/catch/unit/memory/hipPointerGetAttributes.cc \ - -I ./catch/include ./catch/hipTestMain/standalone_main.cc \ - -I ./catch/external/Catch2 -o hipPointerGetAttributes - ./hipPointerGetAttributes - ... + cd "$HIPTESTS_DIR" + hipcc $HIPTESTS_DIR/catch/unit/memory/hipPointerGetAttributes.cc \ + -I ./catch/include ./catch/hipTestMain/standalone_main.cc \ + -I ./catch/external/Catch2 -o hipPointerGetAttributes + ./hipPointerGetAttributes + ... - All tests passed + All tests passed - .. tab-item:: NVIDIA - :sync: nvidia + .. tab-item:: NVIDIA + :sync: nvidia - The commands to build HIP tests on an NVIDIA platform are the same as on an AMD platform. - However, you must first set ``-DHIP_PLATFORM=nvidia``. + The commands to build HIP tests on an NVIDIA platform are the same as on an AMD platform. + However, you must first set ``-DHIP_PLATFORM=nvidia``. Run HIP diff --git a/docs/install/install.rst b/docs/install/install.rst index c5cafac663..522c935edc 100644 --- a/docs/install/install.rst +++ b/docs/install/install.rst @@ -10,10 +10,10 @@ HIP can be installed on AMD (ROCm with HIP-Clang) and NVIDIA (CUDA with NVCC) pl .. note:: - The version definition for the HIP runtime is different from CUDA. On AMD - platforms, the :cpp:func:`hipRuntimeGetVersion` function returns the HIP - runtime version. On NVIDIA platforms, this function returns the CUDA runtime - version. + The version definition for the HIP runtime is different from CUDA. On AMD + platforms, the :cpp:func:`hipRuntimeGetVersion` function returns the HIP + runtime version. On NVIDIA platforms, this function returns the CUDA runtime + version. .. _install_prerequisites: @@ -22,84 +22,88 @@ Prerequisites .. tab-set:: - .. 
tab-item:: AMD - :sync: amd + .. tab-item:: AMD + :sync: amd - Refer to the Prerequisites section in the ROCm install guides: + Refer to the Prerequisites section in the ROCm install guides: - * :doc:`rocm-install-on-linux:reference/system-requirements` - * :doc:`rocm-install-on-windows:reference/system-requirements` + * :doc:`rocm-install-on-linux:reference/system-requirements` + * :doc:`rocm-install-on-windows:reference/system-requirements` - .. tab-item:: NVIDIA - :sync: nvidia + .. tab-item:: NVIDIA + :sync: nvidia - With NVIDIA GPUs, HIP requires unified memory. All CUDA-enabled NVIDIA - GPUs with compute capability 5.0 or later should be supported. For more - information, see `NVIDIA's list of CUDA enabled GPUs `_. + With NVIDIA GPUs, HIP requires unified memory. All CUDA-enabled NVIDIA + GPUs with compute capability 5.0 or later should be supported. For more + information, see `NVIDIA's list of CUDA enabled GPUs `_. Installation ======================================= .. tab-set:: - .. tab-item:: AMD - :sync: amd + .. tab-item:: AMD + :sync: amd - HIP is automatically installed during the ROCm installation. If you haven't yet installed ROCm, you - can find installation instructions here: + HIP is automatically installed during the ROCm installation. If you haven't + yet installed ROCm, you can find installation instructions here: - * :doc:`rocm-install-on-linux:index` - * :doc:`rocm-install-on-windows:index` + * :doc:`rocm-install-on-linux:index` + * :doc:`rocm-install-on-windows:index` - By default, HIP is installed into ``/opt/rocm``. + By default, HIP is installed into ``/opt/rocm``. - .. note:: - There is no autodetection for the HIP installation. If you choose to install it somewhere other than the default location, you must set the ``HIP_PATH`` environment variable as explained in `Build HIP from source <./build.html>`_. + .. note:: + + There is no autodetection for the HIP installation. If you choose to + install it somewhere other than the default location, you must set the + ``HIP_PATH`` environment variable as explained in + `Build HIP from source <./build.html>`_. - .. tab-item:: NVIDIA - :sync: nvidia + .. tab-item:: NVIDIA + :sync: nvidia - #. Install the NVIDIA toolkit. + #. Install the NVIDIA toolkit. - The latest release can be found here: - `CUDA Toolkit `_. + The latest release can be found here: + `CUDA Toolkit `_. - #. Setup the radeon repo. + #. Setup the radeon repo. - .. code-block::shell + .. code-block::shell - # Replace url with appropriate link in the table below - wget https://repo.radeon.com/amdgpu-install/6.2/distro/version_name/amdgpu-install_6.2.60200-1_all.deb - sudo apt install ./amdgpu-install_6.2.60200-1_all.deb - sudo apt update + # Replace url with appropriate link in the table below + wget https://repo.radeon.com/amdgpu-install/6.2/distro/version_name/amdgpu-install_6.2.60200-1_all.deb + sudo apt install ./amdgpu-install_6.2.60200-1_all.deb + sudo apt update - .. list-table:: amdgpu-install links - :widths: 25 100 - :header-rows: 1 + .. 
list-table:: amdgpu-install links + :widths: 25 100 + :header-rows: 1 - * - Ubuntu version - - URL - * - 24.04 - - https://repo.radeon.com/amdgpu-install/6.2.4/ubuntu/noble/amdgpu-install_6.2.60204-1_all.deb - * - 22.04 - - https://repo.radeon.com/amdgpu-install/6.2.4/ubuntu/jammy/amdgpu-install_6.2.60204-1_all.deb + * - Ubuntu version + - URL + * - 24.04 + - https://repo.radeon.com/amdgpu-install/6.2.4/ubuntu/noble/amdgpu-install_6.2.60204-1_all.deb + * - 22.04 + - https://repo.radeon.com/amdgpu-install/6.2.4/ubuntu/jammy/amdgpu-install_6.2.60204-1_all.deb - #. Install the ``hip-runtime-nvidia`` and ``hip-dev`` packages. This installs the CUDA SDK and HIP - porting layer. + #. Install the ``hip-runtime-nvidia`` and ``hip-dev`` packages. This installs the CUDA SDK and HIP + porting layer. - .. code-block:: shell + .. code-block:: shell - apt-get install hip-runtime-nvidia hip-dev + apt-get install hip-runtime-nvidia hip-dev - The default paths are: - * CUDA SDK: ``/usr/local/cuda`` - * HIP: ``/opt/rocm`` + The default paths are: + * CUDA SDK: ``/usr/local/cuda`` + * HIP: ``/opt/rocm`` - #. Set the HIP_PLATFORM to nvidia. + #. Set the HIP_PLATFORM to nvidia. - .. code-block:: shell + .. code-block:: shell - export HIP_PLATFORM="nvidia" + export HIP_PLATFORM="nvidia" Verify your installation ========================================================== @@ -108,4 +112,4 @@ Run ``hipconfig`` in your installation path. .. code-block:: shell - /opt/rocm/bin/hipconfig --full + /opt/rocm/bin/hipconfig --full From 1e86f41b964858d53176417e30c70962d55e5c2a Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Tue, 29 Apr 2025 19:18:45 +0200 Subject: [PATCH 28/32] Added not about HIP runtime build on windows --- docs/install/build.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/install/build.rst b/docs/install/build.rst index 76903a81b9..9e15251029 100644 --- a/docs/install/build.rst +++ b/docs/install/build.rst @@ -180,6 +180,11 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for make -j$(nproc) sudo make install +.. note:: + + HIP runtime is not buildable on Windows as it depends on closed source + components. + Build HIP tests ================================================= From 68fb07be899333b4c0f359e8c54e4167feab2c8a Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Wed, 7 May 2025 16:17:02 +0200 Subject: [PATCH 29/32] Update text --- docs/install/build.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/install/build.rst b/docs/install/build.rst index 9e15251029..b0a7baa43d 100644 --- a/docs/install/build.rst +++ b/docs/install/build.rst @@ -182,8 +182,7 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for .. note:: - HIP runtime is not buildable on Windows as it depends on closed source - components. + The HIP runtime is only buildable on Linux. Build HIP tests ================================================= From 0fa0ec8aa1bad426a8d69a59f6d227e74832c52e Mon Sep 17 00:00:00 2001 From: Scott Date: Mon, 5 May 2025 15:26:12 -0700 Subject: [PATCH 30/32] Add missing newline character to include/hip/linker_types.h. 
--- include/hip/linker_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip/linker_types.h b/include/hip/linker_types.h index fd3d29d09a..505cdcf0f0 100755 --- a/include/hip/linker_types.h +++ b/include/hip/linker_types.h @@ -127,4 +127,4 @@ typedef enum hipJitFallback { #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif -#endif // HIP_INCLUDE_HIP_LINKER_TYPES_H \ No newline at end of file +#endif // HIP_INCLUDE_HIP_LINKER_TYPES_H From a6c064b497301a9773814ee64b0e87cb0fd4f2ac Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Mon, 7 Apr 2025 08:25:31 +0200 Subject: [PATCH 31/32] Add lane masks bit-shift in the porting guide --- .wordlist.txt | 1 + docs/how-to/hip_porting_guide.rst | 64 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/.wordlist.txt b/.wordlist.txt index 1bca54a941..6cbf374ae1 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -15,6 +15,7 @@ bfloat Bitcode bitcode bitcodes +bitmask blockDim blockIdx builtins diff --git a/docs/how-to/hip_porting_guide.rst b/docs/how-to/hip_porting_guide.rst index 136084f66b..8e2b0f2c5e 100644 --- a/docs/how-to/hip_porting_guide.rst +++ b/docs/how-to/hip_porting_guide.rst @@ -611,6 +611,70 @@ code, while the host can query it during runtime via the device properties. See the :ref:`HIP language extension for warpSize ` for information on how to write portable wave-aware code. +Lane masks bit-shift +================================================================================ + +A thread in a warp is also called a lane, and a lane mask is a bitmask where +each bit corresponds to a thread in a warp. A bit is 1 if the thread is active, +0 if it's inactive. Bit-shift operations are typically used to create lane masks +and on AMD GPUs the ``warpSize`` can differ between different architectures, +that's why it's essential to use correct bitmask type, when porting code. + +Example: + +.. code-block:: cpp + + // Get the thread's position in the warp + unsigned int laneId = threadIdx.x % warpSize; + + // Use lane ID for bit-shift + val & ((1 << (threadIdx.x % warpSize) )-1 ); + + // Shift 32 bit integer with val variable + WarpReduce::sum( (val < warpSize) ? (1 << val) : 0); + +Lane masks are 32-bit integer types as this is the integer precision that C +assigns to such constants by default. GCN/CDNA architectures have a warp size of +64, :code:`threadIdx.x % warpSize` and :code:`val` in the example may obtain +values greater than 31. Consequently, shifting by such values would clear the +32-bit register to which the shift operation is applied. For AMD +architectures, a straightforward fix could look as follows: + +.. code-block:: cpp + + // Get the thread's position in the warp + unsigned int laneId = threadIdx.x % warpSize; + + // Use lane ID for bit-shift + val & ((1ull << (threadIdx.x % warpSize) )-1 ); + + // Shift 64 bit integer with val variable + WarpReduce::sum( (val < warpSize) ? (1ull << val) : 0); + +For portability reasons, it is better to introduce appropriately +typed placeholders as shown below: + +.. code-block:: cpp + + #if defined(__GFX8__) || defined(__GFX9__) + typedef uint64_t lane_mask_t; + #else + typedef uint32_t lane_mask_t; + #endif + +The use of :code:`lane_mask_t` with the previous example: + +.. 
code-block:: cpp + + // Get the thread's position in the warp + unsigned int laneId = threadIdx.x % warpSize; + + // Use lane ID for bit-shift + val & ((lane_mask_t{1} << (threadIdx.x % warpSize) )-1 ); + + // Shift 32 or 64 bit integer with val variable + WarpReduce::sum( (val < warpSize) ? (lane_mask_t{1} << val) : 0); + Porting from CUDA __launch_bounds__ ================================================================================ From 6a1d3dbcac6179853f31ae55c84652b8521eae7c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Jun 2025 00:12:30 +0000 Subject: [PATCH 32/32] Bump rocm-docs-core[api_reference] from 1.18.2 to 1.20.0 in /docs/sphinx Bumps [rocm-docs-core[api_reference]](https://github.com/ROCm/rocm-docs-core) from 1.18.2 to 1.20.0. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.18.2...v1.20.0) --- updated-dependencies: - dependency-name: rocm-docs-core[api_reference] dependency-version: 1.20.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 14d231de12..91d93b71fe 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core[api_reference]==1.18.2 +rocm-docs-core[api_reference]==1.20.0 sphinxcontrib.doxylink diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index ada67dda8a..6fa980c2ff 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -211,7 +211,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.18.2 +rocm-docs-core[api-reference]==1.20.0 # via -r requirements.in rpds-py==0.22.3 # via