@@ -75,30 +75,46 @@ ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
7575 if (URAdviceFlags &
7676 (UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY |
7777 UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY |
78- UR_USM_ADVICE_FLAG_BIAS_CACHED | UR_USM_ADVICE_FLAG_BIAS_UNCACHED)) {
78+ UR_USM_ADVICE_FLAG_BIAS_CACHED | UR_USM_ADVICE_FLAG_BIAS_UNCACHED
79+ #if !defined(__HIP_PLATFORM_AMD__)
80+ | UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY |
81+ UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY
82+ #endif
83+ )) {
7984 return UR_RESULT_ERROR_INVALID_ENUMERATION;
8085 }
8186
8287 using ur_to_hip_advice_t = std::pair<ur_usm_advice_flags_t , hipMemoryAdvise>;
8388
84- static constexpr std::array<ur_to_hip_advice_t , 6 >
85- URToHIPMemAdviseDeviceFlags{
86- std::make_pair (UR_USM_ADVICE_FLAG_SET_READ_MOSTLY,
87- hipMemAdviseSetReadMostly),
88- std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY,
89- hipMemAdviseUnsetReadMostly),
90- std::make_pair (UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION,
91- hipMemAdviseSetPreferredLocation),
92- std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION,
93- hipMemAdviseUnsetPreferredLocation),
94- std::make_pair (UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE,
95- hipMemAdviseSetAccessedBy),
96- std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE,
97- hipMemAdviseUnsetAccessedBy),
98- };
99- for (auto &FlagPair : URToHIPMemAdviseDeviceFlags) {
100- if (URAdviceFlags & FlagPair.first ) {
101- UR_CHECK_ERROR (hipMemAdvise (DevPtr, Size, FlagPair.second , Device));
89+ #if defined(__HIP_PLATFORM_AMD__)
90+ constexpr size_t DeviceFlagCount = 8 ;
91+ #else
92+ constexpr size_t DeviceFlagCount = 6 ;
93+ #endif
94+ static constexpr std::array<ur_to_hip_advice_t , DeviceFlagCount>
95+ URToHIPMemAdviseDeviceFlags {
96+ std::make_pair (UR_USM_ADVICE_FLAG_SET_READ_MOSTLY,
97+ hipMemAdviseSetReadMostly),
98+ std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY,
99+ hipMemAdviseUnsetReadMostly),
100+ std::make_pair (UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION,
101+ hipMemAdviseSetPreferredLocation),
102+ std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION,
103+ hipMemAdviseUnsetPreferredLocation),
104+ std::make_pair (UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE,
105+ hipMemAdviseSetAccessedBy),
106+ std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE,
107+ hipMemAdviseUnsetAccessedBy),
108+ #if defined(__HIP_PLATFORM_AMD__)
109+ std::make_pair (UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY,
110+ hipMemAdviseSetCoarseGrain),
111+ std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY,
112+ hipMemAdviseUnsetCoarseGrain),
113+ #endif
114+ };
115+ for (const auto &[URAdvice, HIPAdvice] : URToHIPMemAdviseDeviceFlags) {
116+ if (URAdviceFlags & URAdvice) {
117+ UR_CHECK_ERROR (hipMemAdvise (DevPtr, Size, HIPAdvice, Device));
102118 }
103119 }
104120
@@ -113,10 +129,9 @@ ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
113129 hipMemAdviseUnsetAccessedBy),
114130 };
115131
116- for (auto &FlagPair : URToHIPMemAdviseHostFlags) {
117- if (URAdviceFlags & FlagPair.first ) {
118- UR_CHECK_ERROR (
119- hipMemAdvise (DevPtr, Size, FlagPair.second , hipCpuDeviceId));
132+ for (const auto &[URAdvice, HIPAdvice] : URToHIPMemAdviseHostFlags) {
133+ if (URAdviceFlags & URAdvice) {
134+ UR_CHECK_ERROR (hipMemAdvise (DevPtr, Size, HIPAdvice, hipCpuDeviceId));
120135 }
121136 }
122137
@@ -300,15 +315,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
300315 bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr );
301316
302317 {
303- ur_result_t Result = urDeviceGetInfo (
304- hQueue->Device , UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
305- sizeof (MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr );
306- UR_ASSERT (Result == UR_RESULT_SUCCESS, Result);
318+ MaxThreadsPerBlock[0 ] = hQueue->Device ->getMaxBlockDimX ();
319+ MaxThreadsPerBlock[1 ] = hQueue->Device ->getMaxBlockDimY ();
320+ MaxThreadsPerBlock[2 ] = hQueue->Device ->getMaxBlockDimZ ();
307321
308- Result =
309- urDeviceGetInfo (hQueue->Device , UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
310- sizeof (MaxWorkGroupSize), &MaxWorkGroupSize, nullptr );
311- UR_ASSERT (Result == UR_RESULT_SUCCESS, Result);
322+ MaxWorkGroupSize = hQueue->Device ->getMaxWorkGroupSize ();
312323
313324 // The MaxWorkGroupSize = 1024 for AMD GPU
314325 // The MaxThreadsPerBlock = {1024, 1024, 1024}
@@ -423,11 +434,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
423434 : (LocalMemSzPtrPI ? LocalMemSzPtrPI : nullptr );
424435
425436 if (LocalMemSzPtr) {
426- int DeviceMaxLocalMem = 0 ;
427- UR_CHECK_ERROR (hipDeviceGetAttribute (
428- &DeviceMaxLocalMem, hipDeviceAttributeMaxSharedMemoryPerBlock,
429- Dev->get ()));
430-
437+ int DeviceMaxLocalMem = Dev->getDeviceMaxLocalMem ();
431438 static const int EnvVal = std::atoi (LocalMemSzPtr);
432439 if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) {
433440 setErrorMessage (LocalMemSzPtrUR ? " Invalid value specified for "
@@ -1484,7 +1491,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
14841491
14851492 // If the device does not support managed memory access, we can't set
14861493 // mem_advise.
1487- if (!getAttribute ( Device, hipDeviceAttributeManagedMemory )) {
1494+ if (!Device-> getManagedMemSupport ( )) {
14881495 releaseEvent ();
14891496 setErrorMessage (" mem_advise ignored as device does not support "
14901497 " managed memory access" ,
@@ -1558,7 +1565,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
15581565
15591566 // If the device does not support managed memory access, we can't set
15601567 // mem_advise.
1561- if (!getAttribute ( Device, hipDeviceAttributeManagedMemory )) {
1568+ if (!Device-> getManagedMemSupport ( )) {
15621569 releaseEvent ();
15631570 setErrorMessage (" mem_advise ignored as device does not support "
15641571 " managed memory access" ,
@@ -1575,7 +1582,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
15751582 UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE |
15761583 UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE |
15771584 UR_USM_ADVICE_FLAG_DEFAULT)) {
1578- if (!getAttribute ( Device, hipDeviceAttributeConcurrentManagedAccess )) {
1585+ if (!Device-> getConcurrentManagedAccess ( )) {
15791586 releaseEvent ();
15801587 setErrorMessage (" mem_advise ignored as device does not support "
15811588 " concurrent managed access" ,
@@ -1598,6 +1605,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
15981605 pMem, size, hipMemAdviseUnsetPreferredLocation, DeviceID));
15991606 UR_CHECK_ERROR (
16001607 hipMemAdvise (pMem, size, hipMemAdviseUnsetAccessedBy, DeviceID));
1608+ #if defined(__HIP_PLATFORM_AMD__)
1609+ UR_CHECK_ERROR (
1610+ hipMemAdvise (pMem, size, hipMemAdviseUnsetCoarseGrain, DeviceID));
1611+ #endif
16011612 } else {
16021613 Result = setHipMemAdvise (HIPDevicePtr, size, advice, DeviceID);
16031614 // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid but
@@ -1663,8 +1674,57 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
16631674 UR_CHECK_ERROR (RetImplEvent->start ());
16641675 }
16651676
1677+ // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used: since
1678+ // ROCm 5.6.0 the HIP runtime does not correctly derive the copy kind
1679+ // (direction) for these copies. See: https://github.com/ROCm/clr/issues/40
1680+ // TODO: Add maximum HIP_VERSION when bug has been fixed.
1681+ #if HIP_VERSION >= 50600000
1682+ hipPointerAttribute_t srcAttribs{};
1683+ hipPointerAttribute_t dstAttribs{};
1684+
1685+ bool srcIsSystemAlloc{false };
1686+ bool dstIsSystemAlloc{false };
1687+
1688+ hipError_t hipRes{};
1689+ // When hipPointerGetAttributes returns hipErrorInvalidValue for a non-null
1690+ // pointer, the pointer refers to an OS allocation, i.e. pageable host
1691+ // memory. In that case the attributes result cannot be relied on, so we
1692+ // manually mark pageable system allocations as host memory. The HIP runtime
1693+ // can handle registering/unregistering the memory as long as the right
1694+ // copy kind (direction) is passed to hipMemcpy2DAsync for this case.
1695+ hipRes = hipPointerGetAttributes (&srcAttribs, (const void *)pSrc);
1696+ if (hipRes == hipErrorInvalidValue && pSrc)
1697+ srcIsSystemAlloc = true ;
1698+ hipRes = hipPointerGetAttributes (&dstAttribs, (const void *)pDst);
1699+ if (hipRes == hipErrorInvalidValue && pDst)
1700+ dstIsSystemAlloc = true ;
1701+
1702+ const unsigned int srcMemType{srcAttribs.type };
1703+ const unsigned int dstMemType{dstAttribs.type };
1704+
1705+ const bool srcIsHost{(srcMemType == hipMemoryTypeHost) || srcIsSystemAlloc};
1706+ const bool srcIsDevice{srcMemType == hipMemoryTypeDevice};
1707+ const bool dstIsHost{(dstMemType == hipMemoryTypeHost) || dstIsSystemAlloc};
1708+ const bool dstIsDevice{dstMemType == hipMemoryTypeDevice};
1709+
1710+ unsigned int cpyKind{};
1711+ if (srcIsHost && dstIsHost)
1712+ cpyKind = hipMemcpyHostToHost;
1713+ else if (srcIsHost && dstIsDevice)
1714+ cpyKind = hipMemcpyHostToDevice;
1715+ else if (srcIsDevice && dstIsHost)
1716+ cpyKind = hipMemcpyDeviceToHost;
1717+ else if (srcIsDevice && dstIsDevice)
1718+ cpyKind = hipMemcpyDeviceToDevice;
1719+ else
1720+ cpyKind = hipMemcpyDefault;
1721+
1722+ UR_CHECK_ERROR (hipMemcpy2DAsync (pDst, dstPitch, pSrc, srcPitch, width,
1723+ height, (hipMemcpyKind)cpyKind, HIPStream));
1724+ #else
16661725 UR_CHECK_ERROR (hipMemcpy2DAsync (pDst, dstPitch, pSrc, srcPitch, width,
16671726 height, hipMemcpyDefault, HIPStream));
1727+ #endif
16681728
16691729 if (phEvent) {
16701730 UR_CHECK_ERROR (RetImplEvent->record ());
0 commit comments