Merge remote-tracking branch 'origin/sycl' into HEAD

aelovikov-intel · aelovikov-intel · commit 50b299026459 · 2024-12-13T11:28:02.000-08:00
diff --git a/.github/workflows/sycl-aws.yml b/.github/workflows/sycl-aws.yml
@@ -15,6 +15,9 @@ on:
         description: "JSON string with array of objects with aws-type, runs-on, aws-ami, aws-spot, aws-disk, aws-timebomb, one-job properties"
         type: string
         default: '[{"runs-on":"aws_cuda-${{ github.run_id }}-${{ github.run_attempt }}","aws-ami":"ami-01cb0573cb039ab24","aws-type":["g5.2xlarge","g5.4xlarge"],"aws-disk":"/dev/sda1:64","aws-spot":"false"}]'
+      ref:
+        type: string
+        required: false
 
 jobs:
   aws:
@@ -24,6 +27,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           sparse-checkout: devops/actions/aws-ec2
+          ref: ${{ inputs.ref || github.sha }}
       - run: npm install ./devops/actions/aws-ec2
       - uses: ./devops/actions/aws-ec2
         with:
diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml
@@ -50,6 +50,9 @@ on:
         description: 'Artifacts retention period'
         type: string
         default: 3
+      ref:
+        type: string
+        required: false
 
     outputs:
       build_conclusion:
@@ -143,6 +146,7 @@ jobs:
       with:
         sparse-checkout: |
           devops/actions
+        ref: ${{ inputs.ref || github.sha }}
     # Cleanup will be run after all actions are completed.
     - name: Register cleanup after job is finished
       uses: ./devops/actions/cleanup
diff --git a/.github/workflows/sycl-rel-nightly.yml b/.github/workflows/sycl-rel-nightly.yml
@@ -0,0 +1,180 @@
+name: SYCL Release Branch Nightly
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 3 * * *'
+
+permissions: read-all
+
+env:
+  # NOTE: GitHub Actions only triggers scheduled runs from the workflow file
+  # on the default branch, so every job must explicitly check out the release
+  # branch.
+  SYCL_REL_BRANCH: "draft-sycl-rel-6_0_0"
+
+jobs:
+  # To avoid unnecessary scheduled runs this job checks if there are new commits
+  # since the last run. More precisely, it checks if the last commit is older
+  # than 24h. That means the previous Nightly already tested this commit.
+  check_for_new_commits:
+    # Same repository guard as the build jobs, so the check itself does not run
+    # in other repositories either.
+    if: ${{ github.repository == 'intel/llvm' }}
+    runs-on: ubuntu-latest
+    name: Check for new commits
+    outputs:
+      # Only ever set to 'false' (scheduled run, no recent commit); otherwise it
+      # stays empty, which consumers compare with `!= 'false'`.
+      is_new_commit: ${{ steps.is_new_commit.outputs.is_new_commit }}
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        ref: ${{ env.SYCL_REL_BRANCH }}
+    # Record the commit under test in the job summary.
+    - run: git show --quiet | tee -a "$GITHUB_STEP_SUMMARY"
+
+    - id: is_new_commit
+      # Manual (workflow_dispatch) runs skip this step and are never suppressed.
+      if: ${{ github.event_name == 'schedule' }}
+      run: |
+        if [ -z "$(git rev-list --after="24 hours" HEAD)" ]; then
+          echo "is_new_commit=false" >> "$GITHUB_OUTPUT"
+        fi
+
+  # Builds the Linux toolchain from the release branch via the reusable
+  # sycl-linux-build workflow.
+  ubuntu2204_build:
+    needs: [check_for_new_commits]
+    if: ${{ github.repository == 'intel/llvm' && needs.check_for_new_commits.outputs.is_new_commit != 'false' }}
+    uses: ./.github/workflows/sycl-linux-build.yml
+    secrets: inherit
+    with:
+      build_cache_root: "/__w/"
+      # NOTE(review): the test jobs request the artifact "sycl_linux_default";
+      # confirm that this suffix produces that artifact name.
+      build_artifact_suffix: v6
+      build_configure_extra_args: '--hip --cuda'
+      merge_ref: ''
+      # The `env` context is not available in the `with` of a reusable-workflow
+      # call, so the branch is spelled out. Keep in sync with SYCL_REL_BRANCH.
+      ref: "draft-sycl-rel-6_0_0"
+
+      # We upload the build for people to download/use, override its name and
+      # prefer widespread gzip compression.
+      artifact_archive_name: sycl_linux.tar.gz
+
+  # Runs the e2e and cts test selections against the Linux build, one matrix
+  # entry per runner/device combination.
+  ubuntu2204_test:
+    needs: [ubuntu2204_build]
+    # always() keeps the job eligible even if an upstream job did not succeed;
+    # the explicit build_conclusion check then decides whether to run.
+    if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: AMD/HIP
+            runner: '["Linux", "amdgpu"]'
+            image: ghcr.io/intel/llvm/ubuntu2204_build:latest
+            image_options: -u 1001 --device=/dev/dri --device=/dev/kfd
+            target_devices: ext_oneapi_hip:gpu
+            tests_selector: e2e
+
+          - name: Intel L0 GPU
+            runner: '["Linux", "gen12"]'
+            image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
+            image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
+            target_devices: level_zero:gpu
+            reset_intel_gpu: true
+            tests_selector: e2e
+            extra_lit_opts: --param gpu-intel-gen12=True
+
+          - name: Intel OCL GPU
+            runner: '["Linux", "gen12"]'
+            image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
+            image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
+            target_devices: opencl:gpu
+            reset_intel_gpu: true
+            tests_selector: e2e
+            extra_lit_opts: --param gpu-intel-gen12=True
+
+          - name: Intel OCL CPU
+            runner: '["Linux", "gen12"]'
+            image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
+            image_options: -u 1001 --privileged --cap-add SYS_ADMIN
+            target_devices: opencl:cpu
+            tests_selector: e2e
+
+          - name: SYCL-CTS on OCL CPU
+            runner: '["Linux", "gen12"]'
+            image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
+            image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN
+            target_devices: opencl:cpu
+            tests_selector: cts
+
+          - name: SYCL-CTS on L0 gen12
+            runner: '["Linux", "gen12"]'
+            image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
+            image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN
+            target_devices: level_zero:gpu
+            tests_selector: cts
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
+    with:
+      name: ${{ matrix.name }}
+      runner: ${{ matrix.runner }}
+      image: ${{ matrix.image }}
+      image_options: ${{ matrix.image_options }}
+      target_devices: ${{ matrix.target_devices }}
+      tests_selector: ${{ matrix.tests_selector }}
+      # Not every matrix entry defines the next two keys; they expand to an
+      # empty value for entries that omit them.
+      extra_lit_opts: ${{ matrix.extra_lit_opts }}
+      reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
+      # The `env` context is not available in the `with` of a reusable-workflow
+      # call, so the branch is spelled out. Keep in sync with SYCL_REL_BRANCH.
+      ref: "draft-sycl-rel-6_0_0"
+      merge_ref: ''
+      sycl_toolchain_artifact: sycl_linux_default
+      sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
+      sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }}
+
+  # Builds the Windows toolchain from the release branch via the reusable
+  # sycl-windows-build workflow.
+  build-win:
+    needs: [check_for_new_commits]
+    if: ${{ github.repository == 'intel/llvm' && needs.check_for_new_commits.outputs.is_new_commit != 'false' }}
+    uses: ./.github/workflows/sycl-windows-build.yml
+    with:
+      # We upload both Linux/Windows build via Github's "Releases"
+      # functionality, make sure Linux/Windows names follow the same pattern.
+      artifact_archive_name: sycl_windows.tar.gz
+      # The `env` context is not available in the `with` of a reusable-workflow
+      # call, so the branch is spelled out. Keep in sync with SYCL_REL_BRANCH.
+      build_ref: "draft-sycl-rel-6_0_0"
+
+  # Runs the Windows tests on a gen12 runner against the Windows build.
+  e2e-win:
+    needs: build-win
+    # Continue if build was successful.
+    if: |
+      always()
+      && !cancelled()
+      && needs.build-win.outputs.build_conclusion == 'success'
+    uses: ./.github/workflows/sycl-windows-run-tests.yml
+    with:
+      name: Intel GEN12 Graphics with Level Zero
+      runner: '["Windows","gen12"]'
+      sycl_toolchain_archive: ${{ needs.build-win.outputs.artifact_archive_name }}
+      extra_lit_opts: --param gpu-intel-gen12=True
+      # The `env` context is not available in the `with` of a reusable-workflow
+      # call, so the branch is spelled out. Keep in sync with SYCL_REL_BRANCH.
+      ref: "draft-sycl-rel-6_0_0"
+
+  # Calls the sycl-aws workflow in "start" mode once the Linux build succeeded;
+  # presumably this provisions the AWS runner used by cuda-run-tests.
+  cuda-aws-start:
+    needs: [ubuntu2204_build]
+    if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }}
+    uses: ./.github/workflows/sycl-aws.yml
+    secrets: inherit
+    with:
+      mode: start
+      # The `env` context is not available in the `with` of a reusable-workflow
+      # call, so the branch is spelled out. Keep in sync with SYCL_REL_BRANCH.
+      ref: "draft-sycl-rel-6_0_0"
+
+  # Runs the CUDA tests on the per-run AWS runner against the Linux build.
+  cuda-run-tests:
+    needs: [ubuntu2204_build, cuda-aws-start]
+    if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }}
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
+    with:
+      name: CUDA E2E
+      # The runner label is unique per run and attempt.
+      runner: '["aws_cuda-${{ github.run_id }}-${{ github.run_attempt }}"]'
+      image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab
+      image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1
+      target_devices: ext_oneapi_cuda:gpu
+      # The `env` context is not available in the `with` of a reusable-workflow
+      # call, so the branch is spelled out. Keep in sync with SYCL_REL_BRANCH.
+      ref: "draft-sycl-rel-6_0_0"
+      merge_ref: ''
+
+      sycl_toolchain_artifact: sycl_linux_default
+      sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
+      sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }}
+
+  # Calls the sycl-aws workflow in "stop" mode; presumably this tears down what
+  # cuda-aws-start created.
+  cuda-aws-stop:
+    needs: [cuda-aws-start, cuda-run-tests]
+    # Run regardless of how the start/test jobs ended.
+    if: always()
+    uses: ./.github/workflows/sycl-aws.yml
+    secrets: inherit
+    with:
+      mode: stop
+      # The `env` context is not available in the `with` of a reusable-workflow
+      # call, so the branch is spelled out. Keep in sync with SYCL_REL_BRANCH.
+      ref: "draft-sycl-rel-6_0_0"
diff --git a/clang/test/CodeGenSYCL/nvptx-short-ptr.cpp b/clang/test/CodeGenSYCL/nvptx-short-ptr.cpp
@@ -1,5 +1,6 @@
 // Check that when we see the expected data layouts for NVPTX when we pass the
 // -nvptx-short-ptr option.
+// REQUIRES: nvptx-registered-target
 
 // RUN: %clang_cc1 -fsycl-is-device -disable-llvm-passes \
 // RUN:  -triple nvptx-nvidia-cuda -emit-llvm %s -o - \
diff --git a/clang/test/Driver/sycl-instrumentation-old-model.c b/clang/test/Driver/sycl-instrumentation-old-model.c
@@ -20,19 +20,27 @@
 // CHECK-SPIRV-SAME: "{{.*}}libsycl-itt-stubs.bc"
 // CHECK-HOST-NOT: "-cc1"{{.*}} "-fsycl-is-host"{{.*}} "-fsycl-instrument-device-code"
 
-// ITT annotations in device code are disabled by default. However, for SYCL offloading,
-// we still link ITT annotations libraries to ensure ABI compatibility with previous release.
-// RUN: %clangxx -fsycl --no-offload-new-driver -fsycl-targets=spir64 -### %s 2>&1 \
+// ITT annotations in device code are disabled by default. However, for SYCL
+// offloading, we still link ITT annotations libraries to ensure ABI
+// compatibility with previous release.
+// RUN: %clangxx -fsycl --no-offload-new-driver -fsycl-targets=spir64 -### \
+// RUN:   --sysroot=%S/Inputs/SYCL %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK-ITT-LINK-ONLY %s
-// RUN: %clangxx -fsycl --no-offload-new-driver -fsycl-targets=nvptx64-nvidia-cuda -nocudalib -### %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s
 
 // CHECK-ITT-LINK-ONLY-NOT: "-fsycl-instrument-device-code"
 // CHECK-ITT-LINK-ONLY: llvm-link{{.*}} {{.*}}libsycl-itt-{{.*}}
 
-// RUN: %clangxx -fsycl --no-offload-new-driver -fno-sycl-instrument-device-code -fsycl-targets=spir64 -### %s 2>&1 \
+// Verify that ITT annotations are not pulled in for non-SPIR-V targets as
+// well as when device code instrumentation is explicitly turned off.
+// RUN: %clangxx -fsycl --no-offload-new-driver --sysroot=%S/Inputs/SYCL \
+// RUN:   -fsycl-targets=nvptx64-nvidia-cuda -nocudalib -### %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s
+// RUN: %clangxx -fsycl --no-offload-new-driver --sysroot=%S/Inputs/SYCL \
+// RUN:   -fno-sycl-instrument-device-code -fsycl-targets=spir64 -### %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s
-// RUN: %clangxx -fsycl --no-offload-new-driver -fsycl-targets=nvptx64-nvidia-cuda -fno-sycl-instrument-device-code -nocudalib -### %s 2>&1 \
+// RUN: %clangxx -fsycl --no-offload-new-driver --sysroot=%S/Inputs/SYCL \
+// RUN:   -fsycl-targets=nvptx64-nvidia-cuda -fno-sycl-instrument-device-code \
+// RUN:   -nocudalib -### %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s
 
 // CHECK-NONPASSED-NOT: "-fsycl-instrument-device-code"
diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake
@@ -1,7 +1,7 @@
-# commit 06f48f674445532d8c04be431474901b82c3c449
-# Merge: 098deca1f9f3 1b373f83c71e
+# commit b7047f6c36ec17b8560c2f1cd9ac9521715a9127
+# Merge: 73e5f3c6ff2d fcddf077c290
 # Author: Martin Grant <martin.morrisongrant@codeplay.com>
-# Date:   Thu Dec 12 11:04:15 2024 +0000
-#     Merge pull request #2356 from npmiller/hip-images
-#     [HIP] Disable SYCL images by default
-set(UNIFIED_RUNTIME_TAG 06f48f674445532d8c04be431474901b82c3c449)
+# Date:   Fri Dec 13 14:20:15 2024 +0000
+#     Merge pull request #2454 from Bensuo/l0_cmd-buf_multi-device
+#     Fix L0 command-buffer consumption of multi-device kernels
+set(UNIFIED_RUNTIME_TAG b7047f6c36ec17b8560c2f1cd9ac9521715a9127)
diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp
@@ -94,7 +94,7 @@ retrieveKernelBinary(const QueueImplPtr &Queue, const char *KernelName,
     DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
         KernelName, Context, Device);
     Program = detail::ProgramManager::getInstance().createURProgram(
-        *DeviceImage, Context, {Device});
+        *DeviceImage, Context, {std::move(Device)});
   }
   return {DeviceImage, Program};
 }
diff --git a/sycl/source/detail/persistent_device_code_cache.cpp b/sycl/source/detail/persistent_device_code_cache.cpp
@@ -320,7 +320,7 @@ std::vector<std::vector<char>> PersistentDeviceCodeCache::getItemFromDisc(
 std::vector<std::vector<char>>
 PersistentDeviceCodeCache::getCompiledKernelFromDisc(
     const std::vector<device> &Devices, const std::string &BuildOptionsString,
-    const std::string SourceStr) {
+    const std::string &SourceStr) {
   assert(!Devices.empty());
   std::vector<std::vector<char>> Binaries(Devices.size());
   std::string FileNames;
@@ -518,7 +518,7 @@ std::string PersistentDeviceCodeCache::getCacheItemPath(
 
 std::string PersistentDeviceCodeCache::getCompiledKernelItemPath(
     const device &Device, const std::string &BuildOptionsString,
-    const std::string SourceString) {
+    const std::string &SourceString) {
 
   std::string cache_root{getRootDir()};
   if (cache_root.empty()) {
diff --git a/sycl/source/detail/persistent_device_code_cache.hpp b/sycl/source/detail/persistent_device_code_cache.hpp
@@ -170,7 +170,7 @@ class PersistentDeviceCodeCache {
   static std::string
   getCompiledKernelItemPath(const device &Device,
                             const std::string &BuildOptionsString,
-                            const std::string SourceString);
+                            const std::string &SourceString);
 
   /* Program binaries built for one or more devices are read from persistent
    * cache and returned in form of vector of programs. Each binary program is
@@ -185,7 +185,7 @@ class PersistentDeviceCodeCache {
   static std::vector<std::vector<char>>
   getCompiledKernelFromDisc(const std::vector<device> &Devices,
                             const std::string &BuildOptionsString,
-                            const std::string SourceStr);
+                            const std::string &SourceStr);
 
   /* Stores build program in persistent cache
    */
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
@@ -76,7 +76,7 @@ static ur_program_handle_t
 createBinaryProgram(const ContextImplPtr Context,
                     const std::vector<device> &Devices,
                     const uint8_t **Binaries, size_t *Lengths,
-                    const std::vector<ur_program_metadata_t> Metadata) {
+                    const std::vector<ur_program_metadata_t> &Metadata) {
   const AdapterPtr &Adapter = Context->getAdapter();
   ur_program_handle_t Program;
   std::vector<ur_device_handle_t> DeviceHandles;
@@ -230,7 +230,7 @@ ProgramManager::createURProgram(const RTDeviceBinaryImage &Img,
         "SPIR-V online compilation is not supported in this context");
 
   // Get program metadata from properties
-  auto ProgMetadata = Img.getProgramMetadataUR();
+  const auto &ProgMetadata = Img.getProgramMetadataUR();
 
   // Load the image
   const ContextImplPtr Ctx = getSyclObjImpl(Context);
@@ -990,7 +990,15 @@ ur_program_handle_t ProgramManager::getBuiltURProgram(
     // emplace all subsets of the current set of devices into the cache.
     // Set of all devices is not included in the loop as it was already added
     // into the cache.
-    for (int Mask = 1; Mask < (1 << URDevicesSet.size()) - 1; ++Mask) {
+    int Mask = 1;
+    if (URDevicesSet.size() > sizeof(Mask) * 8 - 1) {
+      // Protection for the algorithm below. Although overflow is very unlikely
+      // to be reached.
+      throw sycl::exception(
+          make_error_code(errc::runtime),
+          "Unable to cache built program for more than 31 devices");
+    }
+    for (; Mask < (1 << URDevicesSet.size()) - 1; ++Mask) {
       std::set<ur_device_handle_t> Subset;
       int Index = 0;
       for (auto It = URDevicesSet.begin(); It != URDevicesSet.end();
@@ -1124,7 +1132,7 @@ ProgramManager::getUrProgramFromUrKernel(ur_kernel_handle_t Kernel,
 
 std::string
 ProgramManager::getProgramBuildLog(const ur_program_handle_t &Program,
-                                   const ContextImplPtr Context) {
+                                   const ContextImplPtr &Context) {
   size_t URDevicesSize = 0;
   const AdapterPtr &Adapter = Context->getAdapter();
   Adapter->call<UrApiKind::urProgramGetInfo>(Program, UR_PROGRAM_INFO_DEVICES,
diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp
@@ -220,7 +220,7 @@ class ProgramManager {
   void addImages(sycl_device_binaries DeviceImages);
   void debugPrintBinaryImages() const;
   static std::string getProgramBuildLog(const ur_program_handle_t &Program,
-                                        const ContextImplPtr Context);
+                                        const ContextImplPtr &Context);
 
   uint32_t getDeviceLibReqMask(const RTDeviceBinaryImage &Img);
 
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
@@ -59,7 +59,7 @@ namespace detail {
 template <typename MemOpFuncT, typename... MemOpArgTs>
 ur_result_t callMemOpHelper(MemOpFuncT &MemOpFunc, MemOpArgTs &&...MemOpArgs) {
   try {
-    MemOpFunc(MemOpArgs...);
+    MemOpFunc(std::forward<MemOpArgTs>(MemOpArgs)...);
   } catch (sycl::exception &e) {
     return static_cast<ur_result_t>(get_ur_error(e));
   }
@@ -70,7 +70,7 @@ template <typename MemOpRet, typename MemOpFuncT, typename... MemOpArgTs>
 ur_result_t callMemOpHelperRet(MemOpRet &MemOpResult, MemOpFuncT &MemOpFunc,
                                MemOpArgTs &&...MemOpArgs) {
   try {
-    MemOpResult = MemOpFunc(MemOpArgs...);
+    MemOpResult = MemOpFunc(std::forward<MemOpArgTs>(MemOpArgs)...);
   } catch (sycl::exception &e) {
     return static_cast<ur_result_t>(get_ur_error(e));
   }
@@ -2891,7 +2891,7 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
                                                        &RawEvents[0]);
   }
 
-  ur_exp_command_buffer_sync_point_t OutSyncPoint;
+  ur_exp_command_buffer_sync_point_t OutSyncPoint{};
   ur_exp_command_buffer_command_handle_t OutCommand = nullptr;
   switch (MCommandGroup->getType()) {
   case CGType::Kernel: {
diff --git a/sycl/test-e2e/Basic/subdevice_pi.cpp b/sycl/test-e2e/Basic/subdevice_pi.cpp
diff --git a/sycl/test-e2e/USM/fill_any_size.cpp b/sycl/test-e2e/USM/fill_any_size.cpp
diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp b/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp
diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images_semaphore.cpp b/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images_semaphore.cpp
diff --git a/sycl/test/basic_tests/vectors/swizzle.cpp b/sycl/test/basic_tests/vectors/swizzle.cpp

Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ retrieveKernelBinary(const QueueImplPtr &Queue, const char *KernelName,`
`94`	`94`	`DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(`
`95`	`95`	`KernelName, Context, Device);`
`96`	`96`	`Program = detail::ProgramManager::getInstance().createURProgram(`
`97`		`- *DeviceImage, Context, {Device});`
	`97`	`+ *DeviceImage, Context, {std::move(Device)});`
`98`	`98`	`}`
`99`	`99`	`return {DeviceImage, Program};`
`100`	`100`	`}`