Skip to content

Commit 73f5c14

Browse files
committed
Merge branch 'utils' into hipMemAdvise_tests
2 parents 8ddd164 + 35f373e commit 73f5c14

File tree

126 files changed

+660
-213
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

126 files changed

+660
-213
lines changed

bin/hipcc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ my $isWindows = ($^O eq 'MSWin32' or $^O eq 'msys');
3535
# escapes args with quotes SWDEV-341955
3636
foreach $arg (@ARGV) {
3737
if ($isWindows) {
38-
$arg =~ s/[^-a-zA-Z0-9_=+,.:\/\\]/\\$&/g;
38+
$arg =~ s/[^-a-zA-Z0-9_=+,.:\/\\ ]/\\$&/g;
3939
}
4040
}
4141

bin/hipcc.pl

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,19 @@
5252
exit(-1);
5353
}
5454

55+
# retrieve --rocm-path hipcc option from command line.
56+
# We need to respect this over the env var ROCM_PATH for this compilation.
57+
sub get_rocm_path_option {
58+
my $rocm_path="";
59+
my @CLArgs = @ARGV;
60+
foreach $arg (@CLArgs) {
61+
if (index($arg,"--rocm-path=") != -1) {
62+
($rocm_path) = $arg=~ /=\s*(.*)\s*$/;
63+
}
64+
}
65+
return $rocm_path;
66+
}
67+
5568
$verbose = $ENV{'HIPCC_VERBOSE'} // 0;
5669
# Verbose: 0x1=commands, 0x2=paths, 0x4=hipcc args
5770

@@ -88,12 +101,18 @@ sub delete_temp_dirs {
88101
}
89102

90103
my $base_dir;
104+
my $rocmPath;
91105
BEGIN {
92106
$base_dir = dirname(Cwd::realpath(__FILE__) );
107+
$rocmPath = get_rocm_path_option();
108+
if ($rocmPath ne '') {
109+
# --rocm-path takes precedence over ENV{ROCM_PATH}
110+
$ENV{ROCM_PATH}=$rocmPath;
111+
}
93112
}
94113
use lib "$base_dir/";
95-
use hipvars;
96114

115+
use hipvars;
97116
$isWindows = $hipvars::isWindows;
98117
$HIP_RUNTIME = $hipvars::HIP_RUNTIME;
99118
$HIP_PLATFORM = $hipvars::HIP_PLATFORM;
@@ -165,9 +184,6 @@ BEGIN
165184
$HIP_CLANG_TARGET = `$HIPCC -print-target-triple`;
166185
chomp($HIP_CLANG_TARGET);
167186

168-
if (! defined $HIP_CLANG_INCLUDE_PATH) {
169-
$HIP_CLANG_INCLUDE_PATH = abs_path("$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include");
170-
}
171187
if (! defined $HIP_INCLUDE_PATH) {
172188
$HIP_INCLUDE_PATH = "$HIP_PATH/include";
173189
}
@@ -180,15 +196,12 @@ BEGIN
180196
print ("HIP_ROCCLR_HOME=$HIP_ROCCLR_HOME\n");
181197
}
182198
print ("HIP_CLANG_PATH=$HIP_CLANG_PATH\n");
183-
print ("HIP_CLANG_INCLUDE_PATH=$HIP_CLANG_INCLUDE_PATH\n");
184199
print ("HIP_INCLUDE_PATH=$HIP_INCLUDE_PATH\n");
185200
print ("HIP_LIB_PATH=$HIP_LIB_PATH\n");
186201
print ("DEVICE_LIB_PATH=$DEVICE_LIB_PATH\n");
187202
print ("HIP_CLANG_TARGET=$HIP_CLANG_TARGET\n");
188203
}
189204

190-
$HIPCXXFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH/..\"";
191-
$HIPCFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH/..\"";
192205
$HIPLDFLAGS .= " -L\"$HIP_LIB_PATH\"";
193206
if ($isWindows) {
194207
$HIPLDFLAGS .= " -lamdhip64";

docs/markdown/hip_porting_guide.md

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -468,40 +468,43 @@ int main()
468468

469469
## CU_POINTER_ATTRIBUTE_MEMORY_TYPE
470470

471-
To get pointer's memory type in HIP/HIP-Clang, developers should use hipPointerGetAttributes API. First parameter of the API is hipPointerAttribute_t which has 'memoryType' as member variable. 'memoryType' indicates input pointer is allocated on device or host.
471+
To get pointer's memory type in HIP/HIP-Clang, developers should use hipPointerGetAttributes API. First parameter of the API is hipPointerAttribute_t which has 'type' as member variable. 'type' indicates input pointer is allocated on device or host.
472472

473473
For example:
474474
```
475475
double * ptr;
476476
hipMalloc(reinterpret_cast<void**>(&ptr), sizeof(double));
477477
hipPointerAttribute_t attr;
478-
hipPointerGetAttributes(&attr, ptr); /*attr.memoryType will have value as hipMemoryTypeDevice*/
478+
hipPointerGetAttributes(&attr, ptr); /*attr.type will have value as hipMemoryTypeDevice*/
479479
480480
double* ptrHost;
481481
hipHostMalloc(&ptrHost, sizeof(double));
482482
hipPointerAttribute_t attr;
483-
hipPointerGetAttributes(&attr, ptrHost); /*attr.memoryType will have value as hipMemoryTypeHost*/
483+
hipPointerGetAttributes(&attr, ptrHost); /*attr.type will have value as hipMemoryTypeHost*/
484484
```
485485
Please note, hipMemoryType enum values are different from cudaMemoryType enum values.
486486

487-
For example, on AMD platform, memoryType is defined in hip_runtime_api.h,
487+
For example, on AMD platform, hipMemoryType is defined in hip_runtime_api.h,
488+
```
488489
typedef enum hipMemoryType {
489-
hipMemoryTypeHost, ///< Memory is physically located on host
490-
hipMemoryTypeDevice, ///< Memory is physically located on device.
491-
hipMemoryTypeArray, ///< Array memory, physically located on device.
492-
hipMemoryTypeUnified ///< Not used currently
490+
hipMemoryTypeHost = 0, ///< Memory is physically located on host
491+
hipMemoryTypeDevice = 1, ///< Memory is physically located on device. (see deviceId for specific device)
492+
hipMemoryTypeArray = 2, ///< Array memory, physically located on device. (see deviceId for specific device)
493+
hipMemoryTypeUnified = 3, ///< Not used currently
494+
hipMemoryTypeManaged = 4 ///< Managed memory, automaticallly managed by the unified memory system
493495
} hipMemoryType;
494-
495-
Looking into CUDA toolkit, it defines memoryType as following,
496+
```
497+
Looking into CUDA toolkit, it defines cudaMemoryType as following,
498+
```
496499
enum cudaMemoryType
497500
{
498501
cudaMemoryTypeUnregistered = 0, // Unregistered memory.
499502
cudaMemoryTypeHost = 1, // Host memory.
500503
cudaMemoryTypeDevice = 2, // Device memory.
501504
cudaMemoryTypeManaged = 3, // Managed memory
502505
}
503-
504-
In this case, memoryType translation for hipPointerGetAttributes needs to be handled properly on nvidia platform to get the correct memory type in CUDA, which is done in the file nvidia_hip_runtime_api.h.
506+
```
507+
In this case, memory type translation for hipPointerGetAttributes needs to be handled properly on nvidia platform to get the correct memory type in CUDA, which is done in the file nvidia_hip_runtime_api.h.
505508

506509
So in any HIP applications which use HIP APIs involving memory types, developers should use #ifdef in order to assign the correct enum values depending on Nvidia or AMD platform.
507510

include/hip/hip_runtime_api.h

Lines changed: 159 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -153,24 +153,31 @@ typedef struct hipDeviceProp_t {
153153
} hipDeviceProp_t;
154154

155155

156-
/**
157-
* Memory type (for pointer attributes)
156+
/*
157+
* @brief HIP Memory type (for pointer attributes)
158+
* @enum
159+
* @ingroup Enumerations
158160
*/
159161
typedef enum hipMemoryType {
160-
hipMemoryTypeHost, ///< Memory is physically located on host
161-
hipMemoryTypeDevice, ///< Memory is physically located on device. (see deviceId for specific
162-
///< device)
163-
hipMemoryTypeArray, ///< Array memory, physically located on device. (see deviceId for specific
164-
///< device)
165-
hipMemoryTypeUnified, ///< Not used currently
166-
hipMemoryTypeManaged ///< Managed memory, automaticallly managed by the unified memory system
162+
hipMemoryTypeHost = 0, ///< Memory is physically located on host
163+
hipMemoryTypeDevice = 1, ///< Memory is physically located on device. (see deviceId for
164+
///< specific device)
165+
hipMemoryTypeArray = 2, ///< Array memory, physically located on device. (see deviceId for
166+
///< specific device)
167+
hipMemoryTypeUnified = 3, ///< Not used currently
168+
hipMemoryTypeManaged = 4 ///< Managed memory, automaticallly managed by the unified
169+
///< memory system
167170
} hipMemoryType;
168171

169172
/**
170173
* Pointer attributes
171174
*/
172175
typedef struct hipPointerAttribute_t {
173-
enum hipMemoryType memoryType;
176+
union {
177+
// Deprecated, use instead type
178+
enum hipMemoryType memoryType;
179+
enum hipMemoryType type;
180+
};
174181
int device;
175182
void* devicePointer;
176183
void* hostPointer;
@@ -6768,6 +6775,148 @@ inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
67686775
return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
67696776
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags);
67706777
}
6778+
/**
6779+
* @brief Returns grid and block size that achieves maximum potential occupancy for a device function
6780+
*
6781+
* Returns in \p *min_grid_size and \p *block_size a suggested grid /
6782+
* block size pair that achieves the best potential occupancy
6783+
* (i.e. the maximum number of active warps on the current device with the smallest number
6784+
* of blocks for a particular function).
6785+
*
6786+
* @param [out] min_grid_size minimum grid size needed to achieve the best potential occupancy
6787+
* @param [out] block_size block size required for the best potential occupancy
6788+
* @param [in] func device function symbol
6789+
* @param [in] block_size_to_dynamic_smem_size - a unary function/functor that takes block size,
6790+
* and returns the size, in bytes, of dynamic shared memory needed for a block
6791+
* @param [in] block_size_limit the maximum block size \p func is designed to work with. 0 means no limit.
6792+
* @param [in] flags reserved
6793+
*
6794+
* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue,
6795+
* #hipErrorUnknown
6796+
*/
6797+
template<typename UnaryFunction, class T>
6798+
static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
6799+
int* min_grid_size,
6800+
int* block_size,
6801+
T func,
6802+
UnaryFunction block_size_to_dynamic_smem_size,
6803+
int block_size_limit = 0,
6804+
unsigned int flags = 0) {
6805+
if (min_grid_size == nullptr || block_size == nullptr ||
6806+
reinterpret_cast<const void*>(func) == nullptr) {
6807+
return hipErrorInvalidValue;
6808+
}
6809+
6810+
int dev;
6811+
hipError_t status;
6812+
if ((status = hipGetDevice(&dev)) != hipSuccess) {
6813+
return status;
6814+
}
6815+
6816+
int max_threads_per_cu;
6817+
if ((status = hipDeviceGetAttribute(&max_threads_per_cu,
6818+
hipDeviceAttributeMaxThreadsPerMultiProcessor, dev)) != hipSuccess) {
6819+
return status;
6820+
}
6821+
6822+
int warp_size;
6823+
if ((status = hipDeviceGetAttribute(&warp_size,
6824+
hipDeviceAttributeWarpSize, dev)) != hipSuccess) {
6825+
return status;
6826+
}
6827+
6828+
int max_cu_count;
6829+
if ((status = hipDeviceGetAttribute(&max_cu_count,
6830+
hipDeviceAttributeMultiprocessorCount, dev)) != hipSuccess) {
6831+
return status;
6832+
}
6833+
6834+
struct hipFuncAttributes attr;
6835+
if ((status = hipFuncGetAttributes(&attr, reinterpret_cast<const void*>(func))) != hipSuccess) {
6836+
return status;
6837+
}
6838+
6839+
// Initial limits for the execution
6840+
const int func_max_threads_per_block = attr.maxThreadsPerBlock;
6841+
if (block_size_limit == 0) {
6842+
block_size_limit = func_max_threads_per_block;
6843+
}
6844+
6845+
if (func_max_threads_per_block < block_size_limit) {
6846+
block_size_limit = func_max_threads_per_block;
6847+
}
6848+
6849+
const int block_size_limit_aligned =
6850+
((block_size_limit + (warp_size - 1)) / warp_size) * warp_size;
6851+
6852+
// For maximum search
6853+
int max_threads = 0;
6854+
int max_block_size{};
6855+
int max_num_blocks{};
6856+
for (int block_size_check_aligned = block_size_limit_aligned;
6857+
block_size_check_aligned > 0;
6858+
block_size_check_aligned -= warp_size) {
6859+
// Make sure the logic uses the requested limit and not aligned
6860+
int block_size_check = (block_size_limit < block_size_check_aligned) ?
6861+
block_size_limit : block_size_check_aligned;
6862+
6863+
size_t dyn_smem_size = block_size_to_dynamic_smem_size(block_size_check);
6864+
int optimal_blocks;
6865+
if ((status = hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
6866+
&optimal_blocks, func, block_size_check, dyn_smem_size, flags)) != hipSuccess) {
6867+
return status;
6868+
}
6869+
6870+
int total_threads = block_size_check * optimal_blocks;
6871+
if (total_threads > max_threads) {
6872+
max_block_size = block_size_check;
6873+
max_num_blocks = optimal_blocks;
6874+
max_threads = total_threads;
6875+
}
6876+
6877+
// Break if the logic reached possible maximum
6878+
if (max_threads_per_cu == max_threads) {
6879+
break;
6880+
}
6881+
}
6882+
6883+
// Grid size is the number of blocks per CU * CU count
6884+
*min_grid_size = max_num_blocks * max_cu_count;
6885+
*block_size = max_block_size;
6886+
6887+
return status;
6888+
}
6889+
6890+
/**
6891+
* @brief Returns grid and block size that achieves maximum potential occupancy for a device function
6892+
*
6893+
* Returns in \p *min_grid_size and \p *block_size a suggested grid /
6894+
* block size pair that achieves the best potential occupancy
6895+
* (i.e. the maximum number of active warps on the current device with the smallest number
6896+
* of blocks for a particular function).
6897+
*
6898+
* @param [out] min_grid_size minimum grid size needed to achieve the best potential occupancy
6899+
* @param [out] block_size block size required for the best potential occupancy
6900+
* @param [in] func device function symbol
6901+
* @param [in] block_size_to_dynamic_smem_size - a unary function/functor that takes block size,
6902+
* and returns the size, in bytes, of dynamic shared memory needed for a block
6903+
* @param [in] block_size_limit the maximum block size \p func is designed to work with. 0 means no limit.
6904+
*
6905+
* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue,
6906+
* #hipErrorUnknown
6907+
*/
6908+
template<typename UnaryFunction, class T>
6909+
static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeVariableSMem(
6910+
int* min_grid_size,
6911+
int* block_size,
6912+
T func,
6913+
UnaryFunction block_size_to_dynamic_smem_size,
6914+
int block_size_limit = 0)
6915+
{
6916+
return hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(min_grid_size, block_size, func,
6917+
block_size_to_dynamic_smem_size, block_size_limit);
6918+
}
6919+
67716920
template <typename F>
67726921
inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
67736922
F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) {

tests/catch/ABM/AddKernels/add.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ TEMPLATE_TEST_CASE("ABM_AddKernel_MultiTypeMultiSize", "", int, long, float, lon
2929
REQUIRE(res == hipSuccess);
3030

3131
hipLaunchKernelGGL(add<TestType>, 1, size, 0, 0, d_a, d_b, d_c, size);
32+
HIP_CHECK(hipGetLastError());
3233

3334
res = hipMemcpy(a.data(), d_c, sizeof(TestType) * size, hipMemcpyDeviceToHost);
3435
REQUIRE(res == hipSuccess);

tests/catch/TypeQualifiers/hipManagedKeyword.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ TEST_CASE("Unit_hipManagedKeyword_SingleGpu") {
5353
hipLaunchKernelGGL(add, dimGrid, dimBlock, 0, 0, static_cast<const float*>(A),
5454
static_cast<float*>(B));
5555

56+
HIP_CHECK(hipGetLastError());
5657
HIP_CHECK(hipDeviceSynchronize());
5758

5859
float maxError = 0.0f;

0 commit comments

Comments
 (0)