Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 17 additions & 21 deletions source/adapters/cuda/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,14 +369,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
UR_RESULT_ERROR_INVALID_VALUE);
}

CUgraphNode GraphNode;
try {
CUgraphNode GraphNode;

std::vector<CUgraphNode> DepsList;
UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
pSyncPointWaitList, DepsList));
std::vector<CUgraphNode> DepsList;
UR_CHECK_ERROR(getNodesFromSyncPoints(
hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));

if (*pGlobalWorkSize == 0) {
try {
if (*pGlobalWorkSize == 0) {
// Create an empty node if the kernel workload size is zero
UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
DepsList.data(), DepsList.size()));
Expand All @@ -386,25 +386,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
if (pSyncPoint) {
*pSyncPoint = SyncPoint;
}
} catch (ur_result_t Err) {
return Err;
return UR_RESULT_SUCCESS;
}
return UR_RESULT_SUCCESS;
}

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();
UR_CHECK_ERROR(
setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim,
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid));
uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();
UR_CHECK_ERROR(setKernelParams(
hCommandBuffer->Context, hCommandBuffer->Device, workDim,
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc,
ThreadsPerBlock, BlocksPerGrid));

try {
// Set node param structure with the kernel related data
auto &ArgIndices = hKernel->getArgIndices();
CUDA_KERNEL_NODE_PARAMS NodeParams = {};
Expand Down
36 changes: 16 additions & 20 deletions source/adapters/hip/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -339,14 +339,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
UR_RESULT_ERROR_INVALID_VALUE);
}

hipGraphNode_t GraphNode;
std::vector<hipGraphNode_t> DepsList;
try {
hipGraphNode_t GraphNode;
std::vector<hipGraphNode_t> DepsList;

UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
pSyncPointWaitList, DepsList));
UR_CHECK_ERROR(getNodesFromSyncPoints(
hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));

if (*pGlobalWorkSize == 0) {
try {
if (*pGlobalWorkSize == 0) {
// Create an empty node if the kernel workload size is zero
UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph,
DepsList.data(), DepsList.size()));
Expand All @@ -356,24 +356,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
if (pSyncPoint) {
*pSyncPoint = SyncPoint;
}
} catch (ur_result_t Err) {
return Err;
return UR_RESULT_SUCCESS;
}
return UR_RESULT_SUCCESS;
}

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

uint32_t LocalSize = hKernel->getLocalSize();
hipFunction_t HIPFunc = hKernel->get();
UR_CHECK_ERROR(setKernelParams(
hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));
uint32_t LocalSize = hKernel->getLocalSize();
hipFunction_t HIPFunc = hKernel->get();
UR_CHECK_ERROR(setKernelParams(
hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));

try {
// Set node param structure with the kernel related data
auto &ArgIndices = hKernel->getArgIndices();
hipKernelNodeParams NodeParams;
Expand Down
Loading