@@ -153,24 +153,31 @@ typedef struct hipDeviceProp_t {
153153} hipDeviceProp_t;
154154
155155
/**
 * @brief HIP memory type (for pointer attributes).
 * @ingroup Enumerations
 *
 * The enumerators carry explicit values so that their numeric meaning does
 * not depend on declaration order.
 */
typedef enum hipMemoryType {
    hipMemoryTypeHost = 0,     ///< Memory is physically located on host
    hipMemoryTypeDevice = 1,   ///< Memory is physically located on device. (see deviceId for
                               ///< specific device)
    hipMemoryTypeArray = 2,    ///< Array memory, physically located on device. (see deviceId for
                               ///< specific device)
    hipMemoryTypeUnified = 3,  ///< Not used currently
    hipMemoryTypeManaged = 4   ///< Managed memory, automatically managed by the unified
                               ///< memory system
} hipMemoryType;
168171
169172/* *
170173 * Pointer attributes
171174 */
172175typedef struct hipPointerAttribute_t {
173- enum hipMemoryType memoryType;
176+ union {
177+ // Deprecated, use instead type
178+ enum hipMemoryType memoryType;
179+ enum hipMemoryType type;
180+ };
174181 int device;
175182 void * devicePointer;
176183 void * hostPointer;
@@ -6768,6 +6775,148 @@ inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
67686775 return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags (
67696776 numBlocks, reinterpret_cast <const void *>(f), blockSize, dynSharedMemPerBlk, flags);
67706777}
6778+ /* *
6779+ * @brief Returns grid and block size that achieves maximum potential occupancy for a device function
6780+ *
6781+ * Returns in \p *min_grid_size and \p *block_size a suggested grid /
6782+ * block size pair that achieves the best potential occupancy
6783+ * (i.e. the maximum number of active warps on the current device with the smallest number
6784+ * of blocks for a particular function).
6785+ *
6786+ * @param [out] min_grid_size minimum grid size needed to achieve the best potential occupancy
6787+ * @param [out] block_size block size required for the best potential occupancy
6788+ * @param [in] func device function symbol
6789+ * @param [in] block_size_to_dynamic_smem_size - a unary function/functor that takes block size,
6790+ * and returns the size, in bytes, of dynamic shared memory needed for a block
6791+ * @param [in] block_size_limit the maximum block size \p func is designed to work with. 0 means no limit.
6792+ * @param [in] flags reserved
6793+ *
6794+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue,
6795+ * #hipErrorUnknown
6796+ */
6797+ template <typename UnaryFunction, class T >
6798+ static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (
6799+ int * min_grid_size,
6800+ int * block_size,
6801+ T func,
6802+ UnaryFunction block_size_to_dynamic_smem_size,
6803+ int block_size_limit = 0 ,
6804+ unsigned int flags = 0 ) {
6805+ if (min_grid_size == nullptr || block_size == nullptr ||
6806+ reinterpret_cast <const void *>(func) == nullptr ) {
6807+ return hipErrorInvalidValue;
6808+ }
6809+
6810+ int dev;
6811+ hipError_t status;
6812+ if ((status = hipGetDevice (&dev)) != hipSuccess) {
6813+ return status;
6814+ }
6815+
6816+ int max_threads_per_cu;
6817+ if ((status = hipDeviceGetAttribute (&max_threads_per_cu,
6818+ hipDeviceAttributeMaxThreadsPerMultiProcessor, dev)) != hipSuccess) {
6819+ return status;
6820+ }
6821+
6822+ int warp_size;
6823+ if ((status = hipDeviceGetAttribute (&warp_size,
6824+ hipDeviceAttributeWarpSize, dev)) != hipSuccess) {
6825+ return status;
6826+ }
6827+
6828+ int max_cu_count;
6829+ if ((status = hipDeviceGetAttribute (&max_cu_count,
6830+ hipDeviceAttributeMultiprocessorCount, dev)) != hipSuccess) {
6831+ return status;
6832+ }
6833+
6834+ struct hipFuncAttributes attr;
6835+ if ((status = hipFuncGetAttributes (&attr, reinterpret_cast <const void *>(func))) != hipSuccess) {
6836+ return status;
6837+ }
6838+
6839+ // Initial limits for the execution
6840+ const int func_max_threads_per_block = attr.maxThreadsPerBlock ;
6841+ if (block_size_limit == 0 ) {
6842+ block_size_limit = func_max_threads_per_block;
6843+ }
6844+
6845+ if (func_max_threads_per_block < block_size_limit) {
6846+ block_size_limit = func_max_threads_per_block;
6847+ }
6848+
6849+ const int block_size_limit_aligned =
6850+ ((block_size_limit + (warp_size - 1 )) / warp_size) * warp_size;
6851+
6852+ // For maximum search
6853+ int max_threads = 0 ;
6854+ int max_block_size{};
6855+ int max_num_blocks{};
6856+ for (int block_size_check_aligned = block_size_limit_aligned;
6857+ block_size_check_aligned > 0 ;
6858+ block_size_check_aligned -= warp_size) {
6859+ // Make sure the logic uses the requested limit and not aligned
6860+ int block_size_check = (block_size_limit < block_size_check_aligned) ?
6861+ block_size_limit : block_size_check_aligned;
6862+
6863+ size_t dyn_smem_size = block_size_to_dynamic_smem_size (block_size_check);
6864+ int optimal_blocks;
6865+ if ((status = hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags (
6866+ &optimal_blocks, func, block_size_check, dyn_smem_size, flags)) != hipSuccess) {
6867+ return status;
6868+ }
6869+
6870+ int total_threads = block_size_check * optimal_blocks;
6871+ if (total_threads > max_threads) {
6872+ max_block_size = block_size_check;
6873+ max_num_blocks = optimal_blocks;
6874+ max_threads = total_threads;
6875+ }
6876+
6877+ // Break if the logic reached possible maximum
6878+ if (max_threads_per_cu == max_threads) {
6879+ break ;
6880+ }
6881+ }
6882+
6883+ // Grid size is the number of blocks per CU * CU count
6884+ *min_grid_size = max_num_blocks * max_cu_count;
6885+ *block_size = max_block_size;
6886+
6887+ return status;
6888+ }
6889+
6890+ /* *
6891+ * @brief Returns grid and block size that achieves maximum potential occupancy for a device function
6892+ *
6893+ * Returns in \p *min_grid_size and \p *block_size a suggested grid /
6894+ * block size pair that achieves the best potential occupancy
6895+ * (i.e. the maximum number of active warps on the current device with the smallest number
6896+ * of blocks for a particular function).
6897+ *
6898+ * @param [out] min_grid_size minimum grid size needed to achieve the best potential occupancy
6899+ * @param [out] block_size block size required for the best potential occupancy
6900+ * @param [in] func device function symbol
6901+ * @param [in] block_size_to_dynamic_smem_size - a unary function/functor that takes block size,
6902+ * and returns the size, in bytes, of dynamic shared memory needed for a block
6903+ * @param [in] block_size_limit the maximum block size \p func is designed to work with. 0 means no limit.
6904+ *
6905+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue,
6906+ * #hipErrorUnknown
6907+ */
6908+ template <typename UnaryFunction, class T >
6909+ static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeVariableSMem (
6910+ int * min_grid_size,
6911+ int * block_size,
6912+ T func,
6913+ UnaryFunction block_size_to_dynamic_smem_size,
6914+ int block_size_limit = 0 )
6915+ {
6916+ return hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (min_grid_size, block_size, func,
6917+ block_size_to_dynamic_smem_size, block_size_limit);
6918+ }
6919+
67716920template <typename F>
67726921inline hipError_t hipOccupancyMaxPotentialBlockSize (int * gridSize, int * blockSize,
67736922 F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) {
0 commit comments