
Commit 09b0e8b

Merge branch 'main' into fabio/binary_update_fix
2 parents 017d8f3 + cc2d590

9 files changed: +43 additions, -46 deletions

.github/docker/install_dpcpp.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,5 +16,5 @@ if [ "${SKIP_DPCPP_BUILD}" ]; then
 fi
 
 mkdir -p ${DPCPP_PATH}/dpcpp_compiler
-wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz
+wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz
 tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/dpcpp_compiler
```

.github/workflows/cmake.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -79,7 +79,7 @@ jobs:
         if: matrix.os == 'ubuntu-22.04'
         run: |
           sudo apt install libncurses5
-          wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz
+          wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz
           mkdir -p ${{github.workspace}}/dpcpp_compiler
           tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C ${{github.workspace}}/dpcpp_compiler
 
```

.github/workflows/multi_device.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -33,7 +33,7 @@ jobs:
       # TODO: enable once test failure are fixed/ignored
       # - name: Download DPC++
       #   run: |
-      #     wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz
+      #     wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz
      #     mkdir dpcpp_compiler
       #     tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler
 
```

All three CI changes above are the same version bump: the pinned intel/llvm nightly used to fetch the DPC++ compiler moves from nightly-2024-01-29 to nightly-2024-09-27.

source/adapters/cuda/command_buffer.cpp

Lines changed: 17 additions & 21 deletions
```diff
@@ -369,14 +369,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
             UR_RESULT_ERROR_INVALID_VALUE);
   }
 
-  CUgraphNode GraphNode;
+  try {
+    CUgraphNode GraphNode;
 
-  std::vector<CUgraphNode> DepsList;
-  UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
-                                        pSyncPointWaitList, DepsList));
+    std::vector<CUgraphNode> DepsList;
+    UR_CHECK_ERROR(getNodesFromSyncPoints(
+        hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));
 
-  if (*pGlobalWorkSize == 0) {
-    try {
+    if (*pGlobalWorkSize == 0) {
       // Create an empty node if the kernel workload size is zero
       UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
                                          DepsList.data(), DepsList.size()));
@@ -386,25 +386,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
       if (pSyncPoint) {
         *pSyncPoint = SyncPoint;
       }
-    } catch (ur_result_t Err) {
-      return Err;
+      return UR_RESULT_SUCCESS;
     }
-    return UR_RESULT_SUCCESS;
-  }
 
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
-  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+    // Set the number of threads per block to the number of threads per warp
+    // by default unless user has provided a better number
+    size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+    size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  uint32_t LocalSize = hKernel->getLocalSize();
-  CUfunction CuFunc = hKernel->get();
-  UR_CHECK_ERROR(
-      setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim,
-                      pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
-                      hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid));
+    uint32_t LocalSize = hKernel->getLocalSize();
+    CUfunction CuFunc = hKernel->get();
+    UR_CHECK_ERROR(setKernelParams(
+        hCommandBuffer->Context, hCommandBuffer->Device, workDim,
+        pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc,
+        ThreadsPerBlock, BlocksPerGrid));
 
-  try {
     // Set node param structure with the kernel related data
     auto &ArgIndices = hKernel->getArgIndices();
     CUDA_KERNEL_NODE_PARAMS NodeParams = {};
```

source/adapters/hip/command_buffer.cpp

Lines changed: 16 additions & 20 deletions
```diff
@@ -339,14 +339,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
             UR_RESULT_ERROR_INVALID_VALUE);
   }
 
-  hipGraphNode_t GraphNode;
-  std::vector<hipGraphNode_t> DepsList;
+  try {
+    hipGraphNode_t GraphNode;
+    std::vector<hipGraphNode_t> DepsList;
 
-  UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
-                                        pSyncPointWaitList, DepsList));
+    UR_CHECK_ERROR(getNodesFromSyncPoints(
+        hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));
 
-  if (*pGlobalWorkSize == 0) {
-    try {
+    if (*pGlobalWorkSize == 0) {
       // Create an empty node if the kernel workload size is zero
       UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph,
                                           DepsList.data(), DepsList.size()));
@@ -356,24 +356,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
      if (pSyncPoint) {
         *pSyncPoint = SyncPoint;
       }
-    } catch (ur_result_t Err) {
-      return Err;
+      return UR_RESULT_SUCCESS;
     }
-    return UR_RESULT_SUCCESS;
-  }
 
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
-  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+    // Set the number of threads per block to the number of threads per warp
+    // by default unless user has provided a better number
+    size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
+    size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  uint32_t LocalSize = hKernel->getLocalSize();
-  hipFunction_t HIPFunc = hKernel->get();
-  UR_CHECK_ERROR(setKernelParams(
-      hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
-      pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));
+    uint32_t LocalSize = hKernel->getLocalSize();
+    hipFunction_t HIPFunc = hKernel->get();
+    UR_CHECK_ERROR(setKernelParams(
+        hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));
 
-  try {
     // Set node param structure with the kernel related data
     auto &ArgIndices = hKernel->getArgIndices();
     hipKernelNodeParams NodeParams;
```
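
The two adapter diffs above make the same structural change: the `try` block that used to cover only the zero-size-workload branch now opens at the top of `urCommandBufferAppendKernelLaunchExp`, so calls such as `getNodesFromSyncPoints` and `setKernelParams`, which previously ran outside any handler, are now covered by the function's existing `catch` (below the visible hunk) instead of letting an exception escape the API boundary. Below is a minimal, self-contained sketch of that idiom; the simplified `ur_result_t` and the `doStepA`/`doStepB` helpers are illustrative stand-ins, not the adapters' real definitions.

```cpp
// Sketch of the error-handling pattern, under assumed simplifications:
// the real ur_result_t is an enum from ur_api.h, and the adapters' own
// UR_CHECK_ERROR reports failure by throwing.
#include <cstdio>

using ur_result_t = int; // stand-in for the real enum
constexpr ur_result_t UR_RESULT_SUCCESS = 0;
constexpr ur_result_t UR_RESULT_ERROR_UNKNOWN = 1;

// Throw failures as ur_result_t so nested errors unwind to a single handler.
#define UR_CHECK_ERROR(Call)                                                   \
  do {                                                                         \
    if (ur_result_t Res = (Call); Res != UR_RESULT_SUCCESS)                    \
      throw Res;                                                               \
  } while (0)

ur_result_t doStepA() { return UR_RESULT_SUCCESS; }       // hypothetical step
ur_result_t doStepB() { return UR_RESULT_ERROR_UNKNOWN; } // simulated failure

// Entry points must not leak exceptions to callers, so the whole body sits
// inside one try block and every throw becomes a returned status code.
ur_result_t urExampleEntryPoint() {
  try {
    UR_CHECK_ERROR(doStepA());
    UR_CHECK_ERROR(doStepB()); // throws; caught below rather than escaping
    return UR_RESULT_SUCCESS;
  } catch (ur_result_t Err) {
    return Err;
  }
}

int main() { std::printf("result: %d\n", urExampleEntryPoint()); }
```

The practical effect in the diffs is that the early checked calls gain the same protection as the later ones; the success path is unchanged.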

test/conformance/device_code/subgroup.cpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -11,6 +11,11 @@ struct KernelFunctor {
   KernelFunctor(sycl::accessor<size_t, 1, sycl::access_mode::write> Acc)
       : Acc(Acc) {}
 
+  auto get(sycl::ext::oneapi::experimental::properties_tag) {
+    return sycl::ext::oneapi::experimental::properties{
+        sycl::ext::oneapi::experimental::sub_group_size<8>};
+  }
+
   void operator()(sycl::nd_item<1> NdItem) const {
     auto SG = NdItem.get_sub_group();
     if (NdItem.get_global_linear_id() == 0) {
```
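
The added `get(properties_tag)` member is the hook from DPC++'s kernel-properties extension (sycl_ext_oneapi_kernel_properties): when a kernel functor exposes it, the returned compile-time properties are attached to the kernel, here pinning the required sub-group size to 8 so the conformance test has a deterministic value to check. A self-contained sketch of the same mechanism, assuming a DPC++ compiler with that extension; the `SubGroupProbe` functor and the host wiring are illustrative, not the test's actual code:

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

namespace syclex = sycl::ext::oneapi::experimental;

struct SubGroupProbe {
  sycl::accessor<size_t, 1, sycl::access_mode::write> Acc;

  // Attach compile-time kernel properties: require a sub-group size of 8.
  auto get(syclex::properties_tag) {
    return syclex::properties{syclex::sub_group_size<8>};
  }

  void operator()(sycl::nd_item<1> NdItem) const {
    // One work-item records the sub-group size the kernel actually got.
    if (NdItem.get_global_linear_id() == 0) {
      Acc[0] = NdItem.get_sub_group().get_max_local_range()[0];
    }
  }
};

int main() {
  sycl::queue Queue;
  size_t Result = 0;
  {
    sycl::buffer<size_t, 1> Buf{&Result, sycl::range<1>{1}};
    Queue.submit([&](sycl::handler &CGH) {
      sycl::accessor Acc{Buf, CGH, sycl::write_only};
      CGH.parallel_for(
          sycl::nd_range<1>{sycl::range<1>{32}, sycl::range<1>{32}},
          SubGroupProbe{Acc});
    });
  } // buffer destruction waits and copies the value back into Result
  std::printf("sub-group size: %zu\n", Result); // expect 8 if honored
}
```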

test/conformance/enqueue/enqueue_adapter_level_zero_v2.match

Lines changed: 0 additions & 1 deletion
```diff
@@ -2,7 +2,6 @@
 urEnqueueDeviceGetGlobalVariableReadTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urEnqueueKernelLaunchTest.InvalidKernelArgs/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urEnqueueKernelLaunchKernelWgSizeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
-urEnqueueKernelLaunchKernelSubGroupTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urEnqueueKernelLaunchWithVirtualMemory.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled
 urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled
```
test/conformance/enqueue/enqueue_adapter_opencl.match

Lines changed: 0 additions & 1 deletion

```diff
@@ -1,5 +1,4 @@
 {{NONDETERMINISTIC}}
 {{OPT}}urEnqueueDeviceGetGlobalVariableReadTest.Success/Intel_R__OpenCL___{{.*}}_
 urEnqueueKernelLaunchKernelWgSizeTest.Success/Intel_R__OpenCL___{{.*}}_
-urEnqueueKernelLaunchKernelSubGroupTest.Success/Intel_R__OpenCL___{{.*}}_
 {{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__OpenCL___{{.*}}_UsePoolEnabled
```

Both match files drop their `urEnqueueKernelLaunchKernelSubGroupTest.Success` entry: with the kernel now pinning its sub-group size, the test is no longer expected to fail on the Level Zero v2 and OpenCL adapters.

test/conformance/enqueue/urEnqueueKernelLaunch.cpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -180,6 +180,8 @@ TEST_P(urEnqueueKernelLaunchKernelSubGroupTest, Success) {
       queue, kernel, n_dimensions, global_offset.data(), global_size.data(),
       nullptr, 0, nullptr, nullptr));
   ASSERT_SUCCESS(urQueueFinish(queue));
+  // We specify this subgroup size in the kernel source, and then the kernel
+  // queries for its subgroup size at runtime and writes it to the buffer.
   ValidateBuffer<size_t>(buffer, sizeof(size_t), 8);
 }
 
```
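
For reference, `ValidateBuffer` is the conformance harness's result check. A plausible shape for such a helper, sketched against the public UR API; the name `CheckBufferContains` and the gtest wiring are illustrative, not the actual harness code:

```cpp
// Hypothetical sketch of a ValidateBuffer-style check: read the buffer back
// through the public UR API and compare every element against the expected
// value (here, the sub-group size of 8 pinned in the kernel source).
#include <ur_api.h>
#include <gtest/gtest.h>
#include <vector>

template <typename T>
void CheckBufferContains(ur_queue_handle_t Queue, ur_mem_handle_t Buffer,
                         size_t SizeBytes, T Expected) {
  std::vector<T> Host(SizeBytes / sizeof(T));
  // Blocking read, so the host copy is valid as soon as the call returns.
  ASSERT_EQ(urEnqueueMemBufferRead(Queue, Buffer, /*blockingRead=*/true,
                                   /*offset=*/0, SizeBytes, Host.data(), 0,
                                   nullptr, nullptr),
            UR_RESULT_SUCCESS);
  for (const T &Value : Host) {
    ASSERT_EQ(Value, Expected);
  }
}
```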
