Skip to content

Commit fd55f66

Browse files
committed
Merge branch 'main' into peter/werror
2 parents 77b4c1a + c311fe8 commit fd55f66

19 files changed

+537
-23
lines changed

include/ur_api.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4037,6 +4037,9 @@ urProgramCreateWithIL(
40374037
///
40384038
/// @details
40394039
/// - The application may call this function from simultaneous threads.
4040+
/// - Following a successful call to this entry point, `phProgram` will
4041+
/// contain a binary of type ::UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or
4042+
/// ::UR_PROGRAM_BINARY_TYPE_LIBRARY for `hDevice`.
40404043
///
40414044
/// @remarks
40424045
/// _Analogues_

scripts/core/program.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ analogue:
127127
- "**clCreateProgramWithBinary**"
128128
details:
129129
- "The application may call this function from simultaneous threads."
130+
- "Following a successful call to this entry point, `phProgram` will contain a binary of type $X_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or $X_PROGRAM_BINARY_TYPE_LIBRARY for `hDevice`."
130131
params:
131132
- type: $x_context_handle_t
132133
name: hContext

source/adapters/cuda/command_buffer.cpp

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,91 @@ static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType,
9999
Params.Depth = 1;
100100
}
101101

102+
// Helper function for enqueuing memory fills
103+
static ur_result_t enqueueCommandBufferFillHelper(
104+
ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice,
105+
const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize,
106+
size_t Size, uint32_t NumSyncPointsInWaitList,
107+
const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
108+
ur_exp_command_buffer_sync_point_t *SyncPoint) {
109+
ur_result_t Result = UR_RESULT_SUCCESS;
110+
std::vector<CUgraphNode> DepsList;
111+
UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
112+
SyncPointWaitList, DepsList),
113+
Result);
114+
115+
try {
116+
const size_t N = Size / PatternSize;
117+
auto Value = *static_cast<const uint32_t *>(Pattern);
118+
auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE
119+
? *static_cast<CUdeviceptr *>(DstDevice)
120+
: (CUdeviceptr)DstDevice;
121+
122+
if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) {
123+
// Create a new node
124+
CUgraphNode GraphNode;
125+
CUDA_MEMSET_NODE_PARAMS NodeParams = {};
126+
NodeParams.dst = DstPtr;
127+
NodeParams.elementSize = PatternSize;
128+
NodeParams.height = N;
129+
NodeParams.pitch = PatternSize;
130+
NodeParams.value = Value;
131+
NodeParams.width = 1;
132+
133+
UR_CHECK_ERROR(cuGraphAddMemsetNode(
134+
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
135+
DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));
136+
137+
// Get sync point and register the cuNode with it.
138+
*SyncPoint =
139+
CommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
140+
141+
} else {
142+
// CUDA has no memset functions that allow setting values more than 4
143+
// bytes. UR API lets you pass an arbitrary "pattern" to the buffer
144+
// fill, which can be more than 4 bytes. We must break up the pattern
145+
// into 4 byte values, and set the buffer using multiple strided calls.
146+
// This means that one cuGraphAddMemsetNode call is made for every 4 bytes
147+
// in the pattern.
148+
149+
size_t NumberOfSteps = PatternSize / sizeof(uint32_t);
150+
151+
// we walk up the pattern in 4-byte steps, and call cuMemset for each
152+
// 4-byte chunk of the pattern.
153+
for (auto Step = 0u; Step < NumberOfSteps; ++Step) {
154+
// take 4 bytes of the pattern
155+
auto Value = *(static_cast<const uint32_t *>(Pattern) + Step);
156+
157+
// offset the pointer to the part of the buffer we want to write to
158+
auto OffsetPtr = DstPtr + (Step * sizeof(uint32_t));
159+
160+
// Create a new node
161+
CUgraphNode GraphNode;
162+
// Update NodeParam
163+
CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {};
164+
NodeParamsStep.dst = (CUdeviceptr)OffsetPtr;
165+
NodeParamsStep.elementSize = 4;
166+
NodeParamsStep.height = N;
167+
NodeParamsStep.pitch = PatternSize;
168+
NodeParamsStep.value = Value;
169+
NodeParamsStep.width = 1;
170+
171+
UR_CHECK_ERROR(cuGraphAddMemsetNode(
172+
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
173+
DepsList.size(), &NodeParamsStep,
174+
CommandBuffer->Device->getContext()));
175+
176+
// Get sync point and register the cuNode with it.
177+
*SyncPoint = CommandBuffer->AddSyncPoint(
178+
std::make_shared<CUgraphNode>(GraphNode));
179+
}
180+
}
181+
} catch (ur_result_t Err) {
182+
Result = Err;
183+
}
184+
return Result;
185+
}
186+
102187
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
103188
ur_context_handle_t hContext, ur_device_handle_t hDevice,
104189
const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
@@ -525,6 +610,119 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
525610
return Result;
526611
}
527612

613+
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
    size_t /*Size*/, ur_usm_migration_flags_t /*Flags*/,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
  // CUDA Graph has no prefetch node type. Substitute an empty node so the
  // command still participates in dependency ordering, and report the dropped
  // hint through the adapter-specific error channel.
  ur_result_t Result = UR_RESULT_SUCCESS;

  std::vector<CUgraphNode> DepsList;
  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
                                 pSyncPointWaitList, DepsList),
          Result);

  try {
    // Empty node preserves the wait-list dependencies.
    CUgraphNode GraphNode;
    UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
                                       DepsList.data(), DepsList.size()));

    // Register the node under a fresh sync point.
    *pSyncPoint =
        hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));

    setErrorMessage("Prefetch hint ignored and replaced with empty node as "
                    "prefetch is not supported by CUDA Graph backend",
                    UR_RESULT_SUCCESS);
    Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC;
  } catch (ur_result_t Err) {
    Result = Err;
  }
  return Result;
}
647+
648+
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
    size_t /*Size*/, ur_usm_advice_flags_t /*Advice*/,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
  // CUDA Graph has no mem-advise node type. Substitute an empty node so the
  // command still participates in dependency ordering, and report the dropped
  // advice through the adapter-specific error channel.
  ur_result_t Result = UR_RESULT_SUCCESS;

  std::vector<CUgraphNode> DepsList;
  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
                                 pSyncPointWaitList, DepsList),
          Result);

  try {
    // Empty node preserves the wait-list dependencies.
    CUgraphNode GraphNode;
    UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
                                       DepsList.data(), DepsList.size()));

    // Register the node under a fresh sync point.
    *pSyncPoint =
        hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));

    setErrorMessage("Memory advice ignored and replaced with empty node as "
                    "memory advice is not supported by CUDA Graph backend",
                    UR_RESULT_SUCCESS);
    Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC;
  } catch (ur_result_t Err) {
    Result = Err;
  }

  return Result;
}
683+
684+
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
    const void *pPattern, size_t patternSize, size_t offset, size_t size,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
  auto PatternIsValid = (pPattern != nullptr);

  // patternSize must be a positive power of two. Checked before any modulo so
  // a patternSize of zero cannot cause a division by zero below.
  auto PatternSizeIsValid =
      (patternSize > 0) && ((patternSize & (patternSize - 1)) == 0);

  // BUGFIX: both offset AND size must be whole multiples of the pattern
  // width — the original `||` let a misaligned offset through whenever the
  // size happened to be aligned (and vice versa).
  auto ArgsAreMultiplesOfPatternSize =
      PatternSizeIsValid && (offset % patternSize == 0) &&
      (size % patternSize == 0);

  UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid &&
                PatternSizeIsValid,
            UR_RESULT_ERROR_INVALID_SIZE);

  // Device pointer to the start of the fill region within the buffer.
  auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;

  return enqueueCommandBufferFillHelper(
      hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize,
      size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
}
707+
708+
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, void *pPtr,
    const void *pPattern, size_t patternSize, size_t size,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
  // The pattern must be non-null and its size a positive power of two.
  const bool PatternIsValid = pPattern != nullptr;
  const bool PatternSizeIsValid =
      ((patternSize & (patternSize - 1)) == 0) && (patternSize > 0);
  UR_ASSERT(PatternIsValid && PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE);

  // Delegate to the shared fill helper; USM pointers are passed by value.
  return enqueueCommandBufferFillHelper(
      hCommandBuffer, pPtr, CU_MEMORYTYPE_UNIFIED, pPattern, patternSize, size,
      numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
}
725+
528726
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
529727
ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue,
530728
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,

source/adapters/cuda/ur_interface_loader.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable(
279279
pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp;
280280
pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp;
281281
pDdiTable->pfnAppendUSMMemcpyExp = urCommandBufferAppendUSMMemcpyExp;
282+
pDdiTable->pfnAppendUSMFillExp = urCommandBufferAppendUSMFillExp;
282283
pDdiTable->pfnAppendMemBufferCopyExp = urCommandBufferAppendMemBufferCopyExp;
283284
pDdiTable->pfnAppendMemBufferCopyRectExp =
284285
urCommandBufferAppendMemBufferCopyRectExp;
@@ -289,6 +290,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable(
289290
urCommandBufferAppendMemBufferWriteExp;
290291
pDdiTable->pfnAppendMemBufferWriteRectExp =
291292
urCommandBufferAppendMemBufferWriteRectExp;
293+
pDdiTable->pfnAppendUSMPrefetchExp = urCommandBufferAppendUSMPrefetchExp;
294+
pDdiTable->pfnAppendUSMAdviseExp = urCommandBufferAppendUSMAdviseExp;
295+
pDdiTable->pfnAppendMemBufferFillExp = urCommandBufferAppendMemBufferFillExp;
292296
pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp;
293297

294298
return retVal;

source/adapters/hip/CMakeLists.txt

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,15 +101,30 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
101101
)
102102

103103
if(UR_ENABLE_COMGR)
104+
set(UR_COMGR_VERSION5_HEADER "${UR_HIP_INCLUDE_DIR}/amd_comgr/amd_comgr.h")
105+
set(UR_COMGR_VERSION4_HEADER "${UR_HIP_INCLUDE_DIR}/amd_comgr.h")
106+
# The COMGR header changed location between ROCm versions 4 and 5.
107+
# Check for existence in the version 5 location or fallback to version 4
108+
if(NOT EXISTS "${UR_COMGR_VERSION5_HEADER}")
109+
if(NOT EXISTS "${UR_COMGR_VERSION4_HEADER}")
110+
message(FATAL_ERROR "Could not find AMD COMGR header at "
111+
"${UR_COMGR_VERSION5_HEADER} or "
112+
"${UR_COMGR_VERSION4_HEADER}, "
113+
"check ROCm installation")
114+
else()
115+
target_compile_definitions(${TARGET_NAME} PRIVATE UR_COMGR_VERSION4_INCLUDE)
116+
endif()
117+
endif()
118+
104119
add_library(amd_comgr SHARED IMPORTED GLOBAL)
105120
set_target_properties(
106121
amd_comgr PROPERTIES
107122
IMPORTED_LOCATION "${UR_HIP_LIB_DIR}/libamd_comgr.so"
108123
INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
109124
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
110125
)
111-
target_link_libraries(pi_hip PUBLIC amd_comgr)
112-
target_compile_definitions(pi_hip PRIVATE SYCL_ENABLE_KERNEL_FUSION)
126+
target_link_libraries(${TARGET_NAME} PUBLIC amd_comgr)
127+
target_compile_definitions(${TARGET_NAME} PRIVATE SYCL_ENABLE_KERNEL_FUSION)
113128
endif(UR_ENABLE_COMGR)
114129

115130
target_link_libraries(${TARGET_NAME} PRIVATE

source/adapters/hip/command_buffer.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,39 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
122122
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
123123
}
124124

125+
// Command-buffer prefetch is not yet implemented in the HIP adapter.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp(
    ur_exp_command_buffer_handle_t, const void *, size_t,
    ur_usm_migration_flags_t, uint32_t,
    const ur_exp_command_buffer_sync_point_t *,
    ur_exp_command_buffer_sync_point_t *) {
  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
132+
133+
// Command-buffer mem-advise is not yet implemented in the HIP adapter.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
    ur_exp_command_buffer_handle_t, const void *, size_t, ur_usm_advice_flags_t,
    uint32_t, const ur_exp_command_buffer_sync_point_t *,
    ur_exp_command_buffer_sync_point_t *) {
  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
139+
140+
// Command-buffer buffer-fill is not yet implemented in the HIP adapter.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
    ur_exp_command_buffer_handle_t, ur_mem_handle_t, const void *, size_t,
    size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *,
    ur_exp_command_buffer_sync_point_t *) {
  // die() aborts the process; the return keeps the signature well-formed.
  detail::ur::die("Experimental Command-buffer feature is not "
                  "implemented for HIP adapter.");
  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
148+
149+
// Command-buffer USM fill is not yet implemented in the HIP adapter.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
    ur_exp_command_buffer_handle_t, void *, const void *, size_t, size_t,
    uint32_t, const ur_exp_command_buffer_sync_point_t *,
    ur_exp_command_buffer_sync_point_t *) {
  // die() aborts the process; the return keeps the signature well-formed.
  detail::ur::die("Experimental Command-buffer feature is not "
                  "implemented for HIP adapter.");
  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
157+
125158
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
126159
ur_exp_command_buffer_handle_t, ur_queue_handle_t, uint32_t,
127160
const ur_event_handle_t *, ur_event_handle_t *) {

source/adapters/hip/common.hpp

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,29 +10,48 @@
1010
#pragma once
1111

1212
#ifdef SYCL_ENABLE_KERNEL_FUSION
13+
#ifdef UR_COMGR_VERSION4_INCLUDE
14+
#include <amd_comgr.h>
15+
#else
1316
#include <amd_comgr/amd_comgr.h>
1417
#endif
18+
#endif
1519
#include <hip/hip_runtime.h>
1620
#include <ur/ur.hpp>
1721

18-
// Hipify doesn't support cuArrayGetDescriptor, on AMD the hipArray can just be
19-
// indexed, but on NVidia it is an opaque type and needs to go through
20-
// cuArrayGetDescriptor so implement a utility function to get the array
21-
// properties
22-
inline void getArrayDesc(hipArray *Array, hipArray_Format &Format,
23-
size_t &Channels) {
22+
// Before ROCm 6, hipify doesn't support cuArrayGetDescriptor, on AMD the
23+
// hipArray can just be indexed, but on NVidia it is an opaque type and needs to
24+
// go through cuArrayGetDescriptor so implement a utility function to get the
25+
// array properties
26+
inline static hipError_t getArrayDesc(hipArray *Array, hipArray_Format &Format,
27+
size_t &Channels) {
28+
#if HIP_VERSION_MAJOR >= 6
29+
HIP_ARRAY_DESCRIPTOR ArrayDesc;
30+
hipError_t err = hipArrayGetDescriptor(&ArrayDesc, Array);
31+
if (err == hipSuccess) {
32+
Format = ArrayDesc.Format;
33+
Channels = ArrayDesc.NumChannels;
34+
}
35+
return err;
36+
#else
2437
#if defined(__HIP_PLATFORM_AMD__)
2538
Format = Array->Format;
2639
Channels = Array->NumChannels;
40+
return hipSuccess;
2741
#elif defined(__HIP_PLATFORM_NVIDIA__)
2842
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
29-
cuArrayGetDescriptor(&ArrayDesc, (CUarray)Array);
30-
31-
Format = ArrayDesc.Format;
32-
Channels = ArrayDesc.NumChannels;
43+
CUresult err = cuArrayGetDescriptor(&ArrayDesc, (CUarray)Array);
44+
if (err == CUDA_SUCCESS) {
45+
Format = ArrayDesc.Format;
46+
Channels = ArrayDesc.NumChannels;
47+
return hipSuccess;
48+
} else {
49+
return hipErrorUnknown; // No easy way to map CUerror to hipError
50+
}
3351
#else
3452
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
3553
#endif
54+
#endif
3655
}
3756

3857
// HIP on NVIDIA headers guard hipArray3DCreate behind __CUDACC__, this does not

0 commit comments

Comments
 (0)