
Commit 09b0e8b

Merge branch 'main' into fabio/binary_update_fix
2 parents 017d8f3 + cc2d590

9 files changed: +43 additions, -46 deletions

.github/docker/install_dpcpp.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,5 +16,5 @@ if [ "${SKIP_DPCPP_BUILD}" ]; then
 fi
 
 mkdir -p ${DPCPP_PATH}/dpcpp_compiler
-wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz
+wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz
 tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/dpcpp_compiler
```

.github/workflows/cmake.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -79,7 +79,7 @@ jobs:
         if: matrix.os == 'ubuntu-22.04'
         run: |
           sudo apt install libncurses5
-          wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz
+          wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz
           mkdir -p ${{github.workspace}}/dpcpp_compiler
           tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C ${{github.workspace}}/dpcpp_compiler
 
```

.github/workflows/multi_device.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -33,7 +33,7 @@ jobs:
       # TODO: enable once test failure are fixed/ignored
       # - name: Download DPC++
       #   run: |
-      #     wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz
+      #     wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz
      #     mkdir dpcpp_compiler
       #     tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler
 
```

All three CI changes above are the same version bump: the pinned intel/llvm nightly used to fetch the DPC++ compiler moves from nightly-2024-01-29 to nightly-2024-09-27.

source/adapters/cuda/command_buffer.cpp

Lines changed: 17 additions & 21 deletions
```diff
@@ -369,14 +369,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
             UR_RESULT_ERROR_INVALID_VALUE);
   }
 
-  CUgraphNode GraphNode;
+  try {
+    CUgraphNode GraphNode;
 
-  std::vector<CUgraphNode> DepsList;
-  UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
-                                        pSyncPointWaitList, DepsList));
+    std::vector<CUgraphNode> DepsList;
+    UR_CHECK_ERROR(getNodesFromSyncPoints(
+        hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));
 
-  if (*pGlobalWorkSize == 0) {
-    try {
+    if (*pGlobalWorkSize == 0) {
       // Create an empty node if the kernel workload size is zero
       UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
                                          DepsList.data(), DepsList.size()));
@@ -386,25 +386,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
       if (pSyncPoint) {
         *pSyncPoint = SyncPoint;
       }
-    } catch (ur_result_t Err) {
-      return Err;
+      return UR_RESULT_SUCCESS;
     }
-    return UR_RESULT_SUCCESS;
-  }
 
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
-  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+    // Set the number of threads per block to the number of threads per warp
+    // by default unless user has provided a better number
+    size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+    size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  uint32_t LocalSize = hKernel->getLocalSize();
-  CUfunction CuFunc = hKernel->get();
-  UR_CHECK_ERROR(
-      setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim,
-                      pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
-                      hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid));
+    uint32_t LocalSize = hKernel->getLocalSize();
+    CUfunction CuFunc = hKernel->get();
+    UR_CHECK_ERROR(setKernelParams(
+        hCommandBuffer->Context, hCommandBuffer->Device, workDim,
+        pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc,
+        ThreadsPerBlock, BlocksPerGrid));
 
-  try {
     // Set node param structure with the kernel related data
     auto &ArgIndices = hKernel->getArgIndices();
     CUDA_KERNEL_NODE_PARAMS NodeParams = {};
```

source/adapters/hip/command_buffer.cpp

Lines changed: 16 additions & 20 deletions
```diff
@@ -339,14 +339,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
             UR_RESULT_ERROR_INVALID_VALUE);
   }
 
-  hipGraphNode_t GraphNode;
-  std::vector<hipGraphNode_t> DepsList;
+  try {
+    hipGraphNode_t GraphNode;
+    std::vector<hipGraphNode_t> DepsList;
 
-  UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
-                                        pSyncPointWaitList, DepsList));
+    UR_CHECK_ERROR(getNodesFromSyncPoints(
+        hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));
 
-  if (*pGlobalWorkSize == 0) {
-    try {
+    if (*pGlobalWorkSize == 0) {
       // Create an empty node if the kernel workload size is zero
       UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph,
                                           DepsList.data(), DepsList.size()));
@@ -356,24 +356,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
      if (pSyncPoint) {
         *pSyncPoint = SyncPoint;
       }
-    } catch (ur_result_t Err) {
-      return Err;
+      return UR_RESULT_SUCCESS;
     }
-    return UR_RESULT_SUCCESS;
-  }
 
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
-  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+    // Set the number of threads per block to the number of threads per warp
+    // by default unless user has provided a better number
+    size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
+    size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  uint32_t LocalSize = hKernel->getLocalSize();
-  hipFunction_t HIPFunc = hKernel->get();
-  UR_CHECK_ERROR(setKernelParams(
-      hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
-      pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));
+    uint32_t LocalSize = hKernel->getLocalSize();
+    hipFunction_t HIPFunc = hKernel->get();
+    UR_CHECK_ERROR(setKernelParams(
+        hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));
 
-  try {
     // Set node param structure with the kernel related data
     auto &ArgIndices = hKernel->getArgIndices();
     hipKernelNodeParams NodeParams;
```
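
The two adapter diffs above make the same structural change: the `try` block that used to cover only the zero-size-workload branch now opens at the top of `urCommandBufferAppendKernelLaunchExp`, so calls such as `getNodesFromSyncPoints` and `setKernelParams`, which previously ran outside any handler, are now covered by the function's existing `catch` (below the visible hunk) instead of letting an exception escape the API boundary. Below is a minimal, self-contained sketch of that idiom; the simplified `ur_result_t` and the `doStepA`/`doStepB` helpers are illustrative stand-ins, not the adapters' real definitions.

```cpp
// Sketch of the error-handling pattern, under assumed simplifications:
// the real ur_result_t is an enum from ur_api.h, and the adapters' own
// UR_CHECK_ERROR reports failure by throwing.
#include <cstdio>

using ur_result_t = int; // stand-in for the real enum
constexpr ur_result_t UR_RESULT_SUCCESS = 0;
constexpr ur_result_t UR_RESULT_ERROR_UNKNOWN = 1;

// Throw failures as ur_result_t so nested errors unwind to a single handler.
#define UR_CHECK_ERROR(Call)                                                   \
  do {                                                                         \
    if (ur_result_t Res = (Call); Res != UR_RESULT_SUCCESS)                    \
      throw Res;                                                               \
  } while (0)

ur_result_t doStepA() { return UR_RESULT_SUCCESS; }       // hypothetical step
ur_result_t doStepB() { return UR_RESULT_ERROR_UNKNOWN; } // simulated failure

// Entry points must not leak exceptions to callers, so the whole body sits
// inside one try block and every throw becomes a returned status code.
ur_result_t urExampleEntryPoint() {
  try {
    UR_CHECK_ERROR(doStepA());
    UR_CHECK_ERROR(doStepB()); // throws; caught below rather than escaping
    return UR_RESULT_SUCCESS;
  } catch (ur_result_t Err) {
    return Err;
  }
}

int main() { std::printf("result: %d\n", urExampleEntryPoint()); }
```

The practical effect in the diffs is that the early checked calls gain the same protection as the later ones; the success path is unchanged.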

test/conformance/device_code/subgroup.cpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -11,6 +11,11 @@ struct KernelFunctor {
   KernelFunctor(sycl::accessor<size_t, 1, sycl::access_mode::write> Acc)
       : Acc(Acc) {}
 
+  auto get(sycl::ext::oneapi::experimental::properties_tag) {
+    return sycl::ext::oneapi::experimental::properties{
+        sycl::ext::oneapi::experimental::sub_group_size<8>};
+  }
+
   void operator()(sycl::nd_item<1> NdItem) const {
     auto SG = NdItem.get_sub_group();
     if (NdItem.get_global_linear_id() == 0) {
```
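
The added `get(properties_tag)` member is the hook from DPC++'s kernel-properties extension (sycl_ext_oneapi_kernel_properties): when a kernel functor exposes it, the returned compile-time properties are attached to the kernel, here pinning the required sub-group size to 8 so the conformance test has a deterministic value to check. A self-contained sketch of the same mechanism, assuming a DPC++ compiler with that extension; the `SubGroupProbe` functor and the host wiring are illustrative, not the test's actual code:

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

namespace syclex = sycl::ext::oneapi::experimental;

struct SubGroupProbe {
  sycl::accessor<size_t, 1, sycl::access_mode::write> Acc;

  // Attach compile-time kernel properties: require a sub-group size of 8.
  auto get(syclex::properties_tag) {
    return syclex::properties{syclex::sub_group_size<8>};
  }

  void operator()(sycl::nd_item<1> NdItem) const {
    // One work-item records the sub-group size the kernel actually got.
    if (NdItem.get_global_linear_id() == 0) {
      Acc[0] = NdItem.get_sub_group().get_max_local_range()[0];
    }
  }
};

int main() {
  sycl::queue Queue;
  size_t Result = 0;
  {
    sycl::buffer<size_t, 1> Buf{&Result, sycl::range<1>{1}};
    Queue.submit([&](sycl::handler &CGH) {
      sycl::accessor Acc{Buf, CGH, sycl::write_only};
      CGH.parallel_for(
          sycl::nd_range<1>{sycl::range<1>{32}, sycl::range<1>{32}},
          SubGroupProbe{Acc});
    });
  } // buffer destruction waits and copies the value back into Result
  std::printf("sub-group size: %zu\n", Result); // expect 8 if honored
}
```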

test/conformance/enqueue/enqueue_adapter_level_zero_v2.match

Lines changed: 0 additions & 1 deletion
```diff
@@ -2,7 +2,6 @@
 urEnqueueDeviceGetGlobalVariableReadTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urEnqueueKernelLaunchTest.InvalidKernelArgs/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urEnqueueKernelLaunchKernelWgSizeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
-urEnqueueKernelLaunchKernelSubGroupTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urEnqueueKernelLaunchWithVirtualMemory.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled
 urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled
```
test/conformance/enqueue/enqueue_adapter_opencl.match

Lines changed: 0 additions & 1 deletion

```diff
@@ -1,5 +1,4 @@
 {{NONDETERMINISTIC}}
 {{OPT}}urEnqueueDeviceGetGlobalVariableReadTest.Success/Intel_R__OpenCL___{{.*}}_
 urEnqueueKernelLaunchKernelWgSizeTest.Success/Intel_R__OpenCL___{{.*}}_
-urEnqueueKernelLaunchKernelSubGroupTest.Success/Intel_R__OpenCL___{{.*}}_
 {{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__OpenCL___{{.*}}_UsePoolEnabled
```

Both match files drop their `urEnqueueKernelLaunchKernelSubGroupTest.Success` entry: with the kernel now pinning its sub-group size, the test is no longer expected to fail on the Level Zero v2 and OpenCL adapters.

test/conformance/enqueue/urEnqueueKernelLaunch.cpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -180,6 +180,8 @@ TEST_P(urEnqueueKernelLaunchKernelSubGroupTest, Success) {
       queue, kernel, n_dimensions, global_offset.data(), global_size.data(),
       nullptr, 0, nullptr, nullptr));
   ASSERT_SUCCESS(urQueueFinish(queue));
+  // We specify this subgroup size in the kernel source, and then the kernel
+  // queries for its subgroup size at runtime and writes it to the buffer.
   ValidateBuffer<size_t>(buffer, sizeof(size_t), 8);
 }
 
```
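
For reference, `ValidateBuffer` is the conformance harness's result check. A plausible shape for such a helper, sketched against the public UR API; the name `CheckBufferContains` and the gtest wiring are illustrative, not the actual harness code:

```cpp
// Hypothetical sketch of a ValidateBuffer-style check: read the buffer back
// through the public UR API and compare every element against the expected
// value (here, the sub-group size of 8 pinned in the kernel source).
#include <ur_api.h>
#include <gtest/gtest.h>
#include <vector>

template <typename T>
void CheckBufferContains(ur_queue_handle_t Queue, ur_mem_handle_t Buffer,
                         size_t SizeBytes, T Expected) {
  std::vector<T> Host(SizeBytes / sizeof(T));
  // Blocking read, so the host copy is valid as soon as the call returns.
  ASSERT_EQ(urEnqueueMemBufferRead(Queue, Buffer, /*blockingRead=*/true,
                                   /*offset=*/0, SizeBytes, Host.data(), 0,
                                   nullptr, nullptr),
            UR_RESULT_SUCCESS);
  for (const T &Value : Host) {
    ASSERT_EQ(Value, Expected);
  }
}
```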
