ROCm
diff --git a/‎bin/hipcc‎
Lines changed: 1 addition & 1 deletion b/‎bin/hipcc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bin/hipcc.pl‎
Lines changed: 20 additions & 7 deletions b/‎bin/hipcc.pl‎
Lines changed: 20 additions & 7 deletions
diff --git a/‎docs/markdown/hip_porting_guide.md‎
Lines changed: 15 additions & 12 deletions b/‎docs/markdown/hip_porting_guide.md‎
Lines changed: 15 additions & 12 deletions
diff --git a/‎include/hip/hip_runtime_api.h‎
Lines changed: 159 additions & 10 deletions b/‎include/hip/hip_runtime_api.h‎
Lines changed: 159 additions & 10 deletions
diff --git a/‎tests/catch/ABM/AddKernels/add.cc‎
Lines changed: 1 addition & 0 deletions b/‎tests/catch/ABM/AddKernels/add.cc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/catch/TypeQualifiers/hipManagedKeyword.cc‎
Lines changed: 1 addition & 0 deletions b/‎tests/catch/TypeQualifiers/hipManagedKeyword.cc‎
Lines changed: 1 addition & 0 deletions
@@ -35,7 +35,7 @@ my $isWindows =  ($^O eq 'MSWin32' or $^O eq 'msys');
 # escapes args with quotes SWDEV-341955
 foreach $arg (@ARGV) {
   if ($isWindows) {
-    $arg =~ s/[^-a-zA-Z0-9_=+,.:\/\\]/\\$&/g;
+    $arg =~ s/[^-a-zA-Z0-9_=+,.:\/\\ ]/\\$&/g;  # backslash-escape every char outside this set; space is in the set, so it stays unescaped
   }
 }
 
 
@@ -52,6 +52,19 @@
     exit(-1);
 }
 
+# Retrieve the value of the --rocm-path=<dir> hipcc option from the command line (@ARGV).
+# It must be respected over the env var ROCM_PATH for this compilation.
+sub get_rocm_path_option {
+  my $rocm_path="";  # stays empty when the option is not present
+  my @CLArgs = @ARGV;
+  foreach $arg (@CLArgs) {
+    if (index($arg,"--rocm-path=") != -1) {  # substring match: the option may appear anywhere in the argument
+      ($rocm_path) = $arg=~ /=\s*(.*)\s*$/;  # keep the text after the first '='; no early exit, so the last match wins
+    }
+  }
+  return $rocm_path;
+}
+
 $verbose = $ENV{'HIPCC_VERBOSE'} // 0;
 # Verbose: 0x1=commands, 0x2=paths, 0x4=hipcc args
 
@@ -88,12 +101,18 @@ sub delete_temp_dirs {
 }
 
 my $base_dir;
+my $rocmPath;
 BEGIN {
     $base_dir = dirname(Cwd::realpath(__FILE__) );
+    $rocmPath = get_rocm_path_option();
+    if ($rocmPath ne '') {
+      # --rocm-path takes precedence over ENV{ROCM_PATH}; set inside BEGIN so it is in place before 'use hipvars' is loaded (presumably hipvars reads ROCM_PATH -- confirm)
+      $ENV{ROCM_PATH}=$rocmPath;
+    }
 }
 use lib "$base_dir/";
-use hipvars;
 
+use hipvars;
 $isWindows      =   $hipvars::isWindows;
 $HIP_RUNTIME    =   $hipvars::HIP_RUNTIME;
 $HIP_PLATFORM   =   $hipvars::HIP_PLATFORM;
@@ -165,9 +184,6 @@ BEGIN
     $HIP_CLANG_TARGET = `$HIPCC -print-target-triple`;
     chomp($HIP_CLANG_TARGET);
 
-    if (! defined $HIP_CLANG_INCLUDE_PATH) {
-        $HIP_CLANG_INCLUDE_PATH = abs_path("$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include");
-    }
     if (! defined $HIP_INCLUDE_PATH) {
         $HIP_INCLUDE_PATH = "$HIP_PATH/include";
     }
@@ -180,15 +196,12 @@ BEGIN
             print ("HIP_ROCCLR_HOME=$HIP_ROCCLR_HOME\n");
         }
         print ("HIP_CLANG_PATH=$HIP_CLANG_PATH\n");
-        print ("HIP_CLANG_INCLUDE_PATH=$HIP_CLANG_INCLUDE_PATH\n");
         print ("HIP_INCLUDE_PATH=$HIP_INCLUDE_PATH\n");
         print ("HIP_LIB_PATH=$HIP_LIB_PATH\n");
         print ("DEVICE_LIB_PATH=$DEVICE_LIB_PATH\n");
         print ("HIP_CLANG_TARGET=$HIP_CLANG_TARGET\n");
     }
 
-    $HIPCXXFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH/..\"";
-    $HIPCFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH/..\"";
     $HIPLDFLAGS .= " -L\"$HIP_LIB_PATH\"";
     if ($isWindows) {
       $HIPLDFLAGS .= " -lamdhip64";
 
@@ -468,40 +468,43 @@ int main()
 
 ## CU_POINTER_ATTRIBUTE_MEMORY_TYPE
 
-To get pointer's memory type in HIP/HIP-Clang, developers should use hipPointerGetAttributes API. First parameter of the API is hipPointerAttribute_t which has 'memoryType' as member variable. 'memoryType' indicates input pointer is allocated on device or host.
+To get a pointer's memory type in HIP/HIP-Clang, developers should use the hipPointerGetAttributes API. The first parameter of the API is a pointer to a hipPointerAttribute_t, which has 'type' as a member variable. 'type' indicates whether the input pointer was allocated on the device or on the host.
 
 For example:
 ```
 double * ptr;
 hipMalloc(reinterpret_cast<void**>(&ptr), sizeof(double));
 hipPointerAttribute_t attr;
-hipPointerGetAttributes(&attr, ptr); /*attr.memoryType will have value as hipMemoryTypeDevice*/
+hipPointerGetAttributes(&attr, ptr); /*attr.type will have value as hipMemoryTypeDevice*/
 
 double* ptrHost;
 hipHostMalloc(&ptrHost, sizeof(double));
 hipPointerAttribute_t attr;
-hipPointerGetAttributes(&attr, ptrHost); /*attr.memoryType will have value as hipMemoryTypeHost*/
+hipPointerGetAttributes(&attr, ptrHost); /*attr.type will have value as hipMemoryTypeHost*/
 ```
 Please note, hipMemoryType enum values are different from cudaMemoryType enum values.
 
-For example, on AMD platform, memoryType is defined in hip_runtime_api.h,
+For example, on AMD platform, hipMemoryType is defined in hip_runtime_api.h,
+```
 typedef enum hipMemoryType {
-    hipMemoryTypeHost,    ///< Memory is physically located on host
-    hipMemoryTypeDevice,  ///< Memory is physically located on device.
-    hipMemoryTypeArray,  ///< Array memory, physically located on device.
-    hipMemoryTypeUnified  ///< Not used currently
+    hipMemoryTypeHost = 0,    ///< Memory is physically located on host
+    hipMemoryTypeDevice = 1,  ///< Memory is physically located on device. (see deviceId for specific device)
+    hipMemoryTypeArray = 2,   ///< Array memory, physically located on device. (see deviceId for specific device)
+    hipMemoryTypeUnified = 3, ///< Not used currently
+    hipMemoryTypeManaged = 4  ///< Managed memory, automatically managed by the unified memory system
 } hipMemoryType;
-
-Looking into CUDA toolkit, it defines memoryType as following,
+```
+Looking into the CUDA toolkit, it defines cudaMemoryType as follows,
+```
 enum cudaMemoryType
 {
   cudaMemoryTypeUnregistered = 0, // Unregistered memory.
   cudaMemoryTypeHost = 1, // Host memory.
   cudaMemoryTypeDevice = 2, // Device memory.
   cudaMemoryTypeManaged = 3, // Managed memory
 }
-
-In this case, memoryType translation for hipPointerGetAttributes needs to be handled properly on nvidia platform to get the correct memory type in CUDA, which is done in the file nvidia_hip_runtime_api.h.
+```
+In this case, memory type translation for hipPointerGetAttributes needs to be handled properly on the NVIDIA platform to get the correct memory type in CUDA, which is done in the file nvidia_hip_runtime_api.h.
 
 So in any HIP applications which use HIP APIs involving memory types, developers should use #ifdef in order to assign the correct enum values depending on Nvidia or AMD platform.
 
 
@@ -153,24 +153,31 @@ typedef struct hipDeviceProp_t {
 } hipDeviceProp_t;
 
 
-/**
- * Memory type (for pointer attributes)
+/**
+ * @brief HIP Memory type (for pointer attributes)
+ * @enum hipMemoryType
+ * @ingroup Enumerations
  */
 typedef enum hipMemoryType {
-    hipMemoryTypeHost,    ///< Memory is physically located on host
-    hipMemoryTypeDevice,  ///< Memory is physically located on device. (see deviceId for specific
-                          ///< device)
-    hipMemoryTypeArray,   ///< Array memory, physically located on device. (see deviceId for specific
-                          ///< device)
-    hipMemoryTypeUnified, ///< Not used currently
-    hipMemoryTypeManaged  ///< Managed memory, automaticallly managed by the unified memory system
+    hipMemoryTypeHost = 0,    ///< Memory is physically located on host
+    hipMemoryTypeDevice = 1,  ///< Memory is physically located on device. (see deviceId for
+                              ///< specific device)
+    hipMemoryTypeArray = 2,   ///< Array memory, physically located on device. (see deviceId for
+                              ///< specific device)
+    hipMemoryTypeUnified = 3, ///< Not used currently
+    hipMemoryTypeManaged = 4  ///< Managed memory, automatically managed by the unified
+                              ///< memory system
 } hipMemoryType;
 
 /**
  * Pointer attributes
  */
 typedef struct hipPointerAttribute_t {
-    enum hipMemoryType memoryType;
+  union {
+      // Deprecated, use instead type
+      enum hipMemoryType memoryType;
+      enum hipMemoryType type;
+    };
     int device;
     void* devicePointer;
     void* hostPointer;
@@ -6768,6 +6775,148 @@ inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
     return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
         numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags);
 }
+/**
+ * @brief Returns the grid and block size that achieve maximum potential occupancy for a device function
+ *
+ * Returns in \p *min_grid_size and \p *block_size a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps on the current device with the smallest number
+ * of blocks for a particular function). Candidate block sizes are tried in warp-size steps.
+ *
+ * @param [out] min_grid_size minimum grid size needed to achieve the best potential occupancy
+ * @param [out] block_size    block size required for the best potential occupancy
+ * @param [in]  func          device function symbol
+ * @param [in]  block_size_to_dynamic_smem_size a unary function/functor that takes a block size
+ * and returns the size, in bytes, of dynamic shared memory needed for a block of that size
+ * @param [in]  block_size_limit the maximum block size \p func is designed to work with. 0 means no limit; values above the function's maxThreadsPerBlock are clamped to it.
+ * @param [in]  flags         reserved; forwarded to hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
+ * (also when \p min_grid_size, \p block_size or \p func is null), #hipErrorUnknown
+ */
+template<typename UnaryFunction, class T>
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
+    int*          min_grid_size,
+    int*          block_size,
+    T             func,
+    UnaryFunction block_size_to_dynamic_smem_size,
+    int           block_size_limit = 0,
+    unsigned int  flags = 0) {
+  if (min_grid_size == nullptr || block_size == nullptr ||
+      reinterpret_cast<const void*>(func) == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  int dev;
+  hipError_t status;
+  if ((status = hipGetDevice(&dev)) != hipSuccess) {
+    return status;
+  }
+
+  int max_threads_per_cu;
+  if ((status = hipDeviceGetAttribute(&max_threads_per_cu,
+      hipDeviceAttributeMaxThreadsPerMultiProcessor, dev)) != hipSuccess) {
+    return status;
+  }
+
+  int warp_size;
+  if ((status = hipDeviceGetAttribute(&warp_size,
+      hipDeviceAttributeWarpSize, dev)) != hipSuccess) {
+    return status;
+  }
+
+  int max_cu_count;
+  if ((status = hipDeviceGetAttribute(&max_cu_count,
+      hipDeviceAttributeMultiprocessorCount, dev)) != hipSuccess) {
+    return status;
+  }
+
+  struct hipFuncAttributes attr;
+  if ((status = hipFuncGetAttributes(&attr, reinterpret_cast<const void*>(func))) != hipSuccess) {
+    return status;
+  }
+
+  // Clamp the requested limit to what the function itself supports (0 selects the function's maximum)
+  const int func_max_threads_per_block = attr.maxThreadsPerBlock;
+  if (block_size_limit == 0) {
+    block_size_limit = func_max_threads_per_block;
+  }
+
+  if (func_max_threads_per_block < block_size_limit) {
+    block_size_limit = func_max_threads_per_block;
+  }
+
+  const int block_size_limit_aligned =
+    ((block_size_limit + (warp_size - 1)) / warp_size) * warp_size;  // round the limit up to a multiple of warp_size
+
+  // Best candidate found so far: its total threads per CU, block size and blocks per CU
+  int max_threads = 0;
+  int max_block_size{};
+  int max_num_blocks{};
+  for (int block_size_check_aligned = block_size_limit_aligned;
+       block_size_check_aligned > 0;
+       block_size_check_aligned -= warp_size) {
+    // Rounding up may push the first candidate past the requested limit; clamp it back to the limit
+    int block_size_check = (block_size_limit < block_size_check_aligned) ?
+        block_size_limit : block_size_check_aligned;
+
+    size_t dyn_smem_size = block_size_to_dynamic_smem_size(block_size_check);
+    int optimal_blocks;
+    if ((status = hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+        &optimal_blocks, func, block_size_check, dyn_smem_size, flags)) != hipSuccess) {
+      return status;
+    }
+
+    int total_threads = block_size_check * optimal_blocks;
+    if (total_threads > max_threads) {  // strict '>': on ties the earlier (larger) block size is kept
+      max_block_size = block_size_check;
+      max_num_blocks = optimal_blocks;
+      max_threads = total_threads;
+    }
+
+    // Stop early once the best candidate reaches the device's max threads per multiprocessor
+    if (max_threads_per_cu == max_threads) {
+      break;
+    }
+  }
+
+  // Grid size is the number of blocks per CU * CU count
+  *min_grid_size = max_num_blocks * max_cu_count;
+  *block_size = max_block_size;
+
+  return status;  // hipSuccess here: every failing call above returned early
+}
+
+/**
+ * @brief Returns the grid and block size that achieve maximum potential occupancy for a device function
+ *
+ * Returns in \p *min_grid_size and \p *block_size a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps on the current device with the smallest number
+ * of blocks for a particular function). Forwards to the WithFlags variant with default flags.
+ *
+ * @param [out] min_grid_size minimum grid size needed to achieve the best potential occupancy
+ * @param [out] block_size    block size required for the best potential occupancy
+ * @param [in]  func          device function symbol
+ * @param [in]  block_size_to_dynamic_smem_size a unary function/functor that takes a block size
+ * and returns the size, in bytes, of dynamic shared memory needed for a block of that size
+ * @param [in]  block_size_limit the maximum block size \p func is designed to work with. 0 means no limit.
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue,
+ * #hipErrorUnknown
+ */
+template<typename UnaryFunction, class T>
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeVariableSMem(
+    int*          min_grid_size,
+    int*          block_size,
+    T             func,
+    UnaryFunction block_size_to_dynamic_smem_size,
+    int           block_size_limit = 0)
+{
+    return hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(min_grid_size, block_size, func,
+      block_size_to_dynamic_smem_size, block_size_limit);  // flags is left at its default of 0
+}
+
 template <typename F>
 inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
                                                     F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) {
 
@@ -29,6 +29,7 @@ TEMPLATE_TEST_CASE("ABM_AddKernel_MultiTypeMultiSize", "", int, long, float, lon
   REQUIRE(res == hipSuccess);
 
   hipLaunchKernelGGL(add<TestType>, 1, size, 0, 0, d_a, d_b, d_c, size);
+  HIP_CHECK(hipGetLastError());
 
   res = hipMemcpy(a.data(), d_c, sizeof(TestType) * size, hipMemcpyDeviceToHost);
   REQUIRE(res == hipSuccess);
 
@@ -53,6 +53,7 @@ TEST_CASE("Unit_hipManagedKeyword_SingleGpu") {
   hipLaunchKernelGGL(add, dimGrid, dimBlock, 0, 0, static_cast<const float*>(A),
                      static_cast<float*>(B));
 
+  HIP_CHECK(hipGetLastError());
   HIP_CHECK(hipDeviceSynchronize());
 
   float maxError = 0.0f;
Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ my $isWindows = ($^O eq 'MSWin32' or $^O eq 'msys');`
`35`	`35`	`# escapes args with quotes SWDEV-341955`
`36`	`36`	`foreach $arg (@ARGV) {`
`37`	`37`	`if ($isWindows) {`
`38`		`- $arg =~ s/[^-a-zA-Z0-9_=+,.:\/\\]/\\$&/g;`
	`38`	`+ $arg =~ s/[^-a-zA-Z0-9_=+,.:\/\\ ]/\\$&/g;`
`39`	`39`	`}`
`40`	`40`	`}`
`41`	`41`