diff --git a/buildbot/check.py b/buildbot/check.py index 70e98c1a701ea..6d84031a6dfb5 100644 --- a/buildbot/check.py +++ b/buildbot/check.py @@ -6,6 +6,7 @@ DEFAULT_CPU_COUNT = 4 + def do_check(args): try: cpu_count = multiprocessing.cpu_count() @@ -14,45 +15,67 @@ def do_check(args): # Get absolute path to source directory if args.src_dir: - abs_src_dir = os.path.abspath(args.src_dir) + abs_src_dir = os.path.abspath(args.src_dir) else: - abs_src_dir = os.path.abspath(os.path.join(__file__, "../..")) + abs_src_dir = os.path.abspath(os.path.join(__file__, "../..")) # Get absolute path to build directory if args.obj_dir: - abs_obj_dir = os.path.abspath(args.obj_dir) + abs_obj_dir = os.path.abspath(args.obj_dir) else: - abs_obj_dir = os.path.join(abs_src_dir, "build") + abs_obj_dir = os.path.join(abs_src_dir, "build") cmake_cmd = [ "cmake", - "--build", abs_obj_dir, + "--build", + abs_obj_dir, "--", args.test_suite, - "-j", str(cpu_count)] + "-j", + str(cpu_count), + ] print("[Cmake Command]: {}".format(" ".join(cmake_cmd))) - env_tmp=os.environ - env_tmp["LIT_ARGS"]="\"{}\"".format("-v") + env_tmp = os.environ + env_tmp["LIT_ARGS"] = '"{}"'.format("-v") subprocess.check_call(cmake_cmd, cwd=abs_obj_dir, env=env_tmp) ret = True return ret + def main(): - parser = argparse.ArgumentParser(prog="check.py", - description="script to do LIT testing", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("-n", "--build-number", metavar="BUILD_NUM", help="build number") + parser = argparse.ArgumentParser( + prog="check.py", + description="script to do LIT testing", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "-n", "--build-number", metavar="BUILD_NUM", help="build number" + ) parser.add_argument("-b", "--branch", metavar="BRANCH", help="pull request branch") - parser.add_argument("-d", "--base-branch", metavar="BASE_BRANCH", help="pull request base branch") - parser.add_argument("-r", "--pr-number", metavar="PR_NUM", help="pull request number") - parser.add_argument("-w", "--builder-dir", metavar="BUILDER_DIR", - help="builder directory, which is the directory containing source and build directories") + parser.add_argument( + "-d", "--base-branch", metavar="BASE_BRANCH", help="pull request base branch" + ) + parser.add_argument( + "-r", "--pr-number", metavar="PR_NUM", help="pull request number" + ) + parser.add_argument( + "-w", + "--builder-dir", + metavar="BUILDER_DIR", + help="builder directory, which is the directory containing source and build directories", + ) parser.add_argument("-s", "--src-dir", metavar="SRC_DIR", help="source directory") parser.add_argument("-o", "--obj-dir", metavar="OBJ_DIR", help="build directory") - parser.add_argument("-t", "--test-suite", metavar="TEST_SUITE", default="check-all", help="check-xxx target") + parser.add_argument( + "-t", + "--test-suite", + metavar="TEST_SUITE", + default="check-all", + help="check-xxx target", + ) args = parser.parse_args() @@ -60,6 +83,7 @@ def main(): return do_check(args) + if __name__ == "__main__": ret = main() exit_code = 0 if ret else 1 diff --git a/buildbot/clang_tidy.py b/buildbot/clang_tidy.py index c9c9eb37d7b68..3d5e46ac0df9c 100644 --- a/buildbot/clang_tidy.py +++ b/buildbot/clang_tidy.py @@ -5,10 +5,16 @@ FILE_EXTENSIONS = [".h", ".hpp", ".c", ".cc", ".cpp"] + def do_clang_tidy(args): ret = False - merge_base_cmd = ["git", "merge-base", "origin/{}".format(args.base_branch), args.branch] + merge_base_cmd = [ + "git", + "merge-base", + "origin/{}".format(args.base_branch), 
+ args.branch, + ] print(merge_base_cmd) base_commit = subprocess.check_output(merge_base_cmd, cwd=args.src_dir) base_commit = base_commit.rstrip() @@ -19,13 +25,15 @@ def do_clang_tidy(args): diff_cmd = ["git", "--no-pager", "diff", base_commit, args.branch, "--name-only"] print(diff_cmd) - with open(changed_files, 'w') as f: - subprocess.check_call(merge_base_cmd, cwd=args.src_dir, stdout=f, stderr=subprocess.STDOUT) + with open(changed_files, "w") as f: + subprocess.check_call( + merge_base_cmd, cwd=args.src_dir, stdout=f, stderr=subprocess.STDOUT + ) if os.path.isfile(changed_files): clang_tidy_binary = os.path.join(args.obj_dir, "bin", "clang-tidy") if os.path.isfile(clang_tidy_binary): - with open(changed_files, 'r') as f: + with open(changed_files, "r") as f: for line in f: filename, file_extension = os.path.splitext(line) if file_extension.lower() in FILE_EXTENSIONS: @@ -41,19 +49,42 @@ def do_clang_tidy(args): return ret + def main(): - parser = argparse.ArgumentParser(prog="clang_tidy.py", - description="script to do clang_tidy", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("-n", "--build-number", metavar="BUILD_NUM", help="build number") - parser.add_argument("-b", "--branch", metavar="BRANCH", required=True, help="pull request branch") - parser.add_argument("-d", "--base-branch", metavar="BASE_BRANCH", required=True, - help="pull request base branch") - parser.add_argument("-r", "--pr-number", metavar="PR_NUM", help="pull request number") - parser.add_argument("-w", "--builder-dir", metavar="BUILDER_DIR", required=True, - help="builder directory, which is the directory containing source and build directories") - parser.add_argument("-s", "--src-dir", metavar="SRC_DIR", required=True, help="source directory") - parser.add_argument("-o", "--obj-dir", metavar="OBJ_DIR", required=True, help="build directory") + parser = argparse.ArgumentParser( + prog="clang_tidy.py", + description="script to do clang_tidy", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "-n", "--build-number", metavar="BUILD_NUM", help="build number" + ) + parser.add_argument( + "-b", "--branch", metavar="BRANCH", required=True, help="pull request branch" + ) + parser.add_argument( + "-d", + "--base-branch", + metavar="BASE_BRANCH", + required=True, + help="pull request base branch", + ) + parser.add_argument( + "-r", "--pr-number", metavar="PR_NUM", help="pull request number" + ) + parser.add_argument( + "-w", + "--builder-dir", + metavar="BUILDER_DIR", + required=True, + help="builder directory, which is the directory containing source and build directories", + ) + parser.add_argument( + "-s", "--src-dir", metavar="SRC_DIR", required=True, help="source directory" + ) + parser.add_argument( + "-o", "--obj-dir", metavar="OBJ_DIR", required=True, help="build directory" + ) args = parser.parse_args() @@ -61,8 +92,8 @@ def main(): return do_clang_tidy(args) + if __name__ == "__main__": ret = main() exit_code = 0 if ret else 1 sys.exit(exit_code) - diff --git a/buildbot/compile.py b/buildbot/compile.py index b1c8e22ed1537..055abd6dab739 100644 --- a/buildbot/compile.py +++ b/buildbot/compile.py @@ -18,24 +18,27 @@ def do_compile(args): # Get absolute path to source directory if args.src_dir: - abs_src_dir = os.path.abspath(args.src_dir) + abs_src_dir = os.path.abspath(args.src_dir) else: - abs_src_dir = os.path.abspath(os.path.join(__file__, "../..")) + abs_src_dir = os.path.abspath(os.path.join(__file__, "../..")) # Get absolute path to build directory if 
args.obj_dir: - abs_obj_dir = os.path.abspath(args.obj_dir) + abs_obj_dir = os.path.abspath(args.obj_dir) else: - abs_obj_dir = os.path.join(abs_src_dir, "build") + abs_obj_dir = os.path.join(abs_src_dir, "build") cmake_cmd = [ "cmake", - "--build", abs_obj_dir, + "--build", + abs_obj_dir, "--", args.build_target, - "-j", str(cpu_count)] + "-j", + str(cpu_count), + ] if args.verbose: - cmake_cmd.append("--verbose") + cmake_cmd.append("--verbose") print("[Cmake Command]: {}".format(" ".join(cmake_cmd))) @@ -45,20 +48,45 @@ def do_compile(args): def main(): - parser = argparse.ArgumentParser(prog="compile.py", - description="script to do compile", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("-n", "--build-number", metavar="BUILD_NUM", help="build number") + parser = argparse.ArgumentParser( + prog="compile.py", + description="script to do compile", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "-n", "--build-number", metavar="BUILD_NUM", help="build number" + ) parser.add_argument("-b", "--branch", metavar="BRANCH", help="pull request branch") - parser.add_argument("-d", "--base-branch", metavar="BASE_BRANCH", help="pull request base branch") - parser.add_argument("-r", "--pr-number", metavar="PR_NUM", help="pull request number") - parser.add_argument("-w", "--builder-dir", metavar="BUILDER_DIR", - help="builder directory, which is the directory containing source and build directories") + parser.add_argument( + "-d", "--base-branch", metavar="BASE_BRANCH", help="pull request base branch" + ) + parser.add_argument( + "-r", "--pr-number", metavar="PR_NUM", help="pull request number" + ) + parser.add_argument( + "-w", + "--builder-dir", + metavar="BUILDER_DIR", + help="builder directory, which is the directory containing source and build directories", + ) parser.add_argument("-s", "--src-dir", metavar="SRC_DIR", help="source directory") parser.add_argument("-o", "--obj-dir", metavar="OBJ_DIR", help="build directory") - parser.add_argument("-j", "--build-parallelism", metavar="BUILD_PARALLELISM", help="build parallelism") - parser.add_argument("-v", "--verbose", action='store_true', help="verbose build output") - parser.add_argument("-t", "--build-target", metavar="BUILD_TARGET", default="deploy-sycl-toolchain", help="set build target") + parser.add_argument( + "-j", + "--build-parallelism", + metavar="BUILD_PARALLELISM", + help="build parallelism", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="verbose build output" + ) + parser.add_argument( + "-t", + "--build-target", + metavar="BUILD_TARGET", + default="deploy-sycl-toolchain", + help="set build target", + ) args = parser.parse_args() diff --git a/buildbot/configure.py b/buildbot/configure.py index f172be352ba7d..fc89f8b7b00bf 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -5,24 +5,31 @@ import subprocess import sys + def do_configure(args): # Get absolute path to source directory - abs_src_dir = os.path.abspath(args.src_dir if args.src_dir else os.path.join(__file__, "../..")) + abs_src_dir = os.path.abspath( + args.src_dir if args.src_dir else os.path.join(__file__, "../..") + ) # Get absolute path to build directory - abs_obj_dir = os.path.abspath(args.obj_dir) if args.obj_dir else os.path.join(abs_src_dir, "build") + abs_obj_dir = ( + os.path.abspath(args.obj_dir) + if args.obj_dir + else os.path.join(abs_src_dir, "build") + ) # Create build directory if it doesn't exist if not os.path.isdir(abs_obj_dir): - os.makedirs(abs_obj_dir) + 
os.makedirs(abs_obj_dir) - llvm_external_projects = 'sycl;llvm-spirv;opencl;xpti;xptifw' + llvm_external_projects = "sycl;llvm-spirv;opencl;xpti;xptifw" # libdevice build requires a working SYCL toolchain, which is not the case # with macOS target right now. if sys.platform != "darwin": - llvm_external_projects += ';libdevice' + llvm_external_projects += ";libdevice" - libclc_amd_target_names = ';amdgcn--amdhsa' - libclc_nvidia_target_names = ';nvptx64--nvidiacl' + libclc_amd_target_names = ";amdgcn--amdhsa" + libclc_nvidia_target_names = ";nvptx64--nvidiacl" sycl_enable_fusion = "OFF" if not args.disable_fusion: @@ -40,50 +47,50 @@ def do_configure(args): libdevice_dir = os.path.join(abs_src_dir, "libdevice") fusion_dir = os.path.join(abs_src_dir, "sycl-fusion") llvm_targets_to_build = args.host_target - llvm_enable_projects = 'clang;' + llvm_external_projects - libclc_build_native = 'OFF' - libclc_targets_to_build = '' - libclc_gen_remangled_variants = 'OFF' - sycl_build_pi_hip_platform = 'AMD' - sycl_clang_extra_flags = '' - sycl_werror = 'OFF' - llvm_enable_assertions = 'ON' - llvm_enable_doxygen = 'OFF' - llvm_enable_sphinx = 'OFF' - llvm_build_shared_libs = 'OFF' - llvm_enable_lld = 'OFF' + llvm_enable_projects = "clang;" + llvm_external_projects + libclc_build_native = "OFF" + libclc_targets_to_build = "" + libclc_gen_remangled_variants = "OFF" + sycl_build_pi_hip_platform = "AMD" + sycl_clang_extra_flags = "" + sycl_werror = "OFF" + llvm_enable_assertions = "ON" + llvm_enable_doxygen = "OFF" + llvm_enable_sphinx = "OFF" + llvm_build_shared_libs = "OFF" + llvm_enable_lld = "OFF" sycl_enabled_plugins = ["opencl"] - sycl_preview_lib = 'ON' + sycl_preview_lib = "ON" - sycl_enable_xpti_tracing = 'ON' - xpti_enable_werror = 'OFF' + sycl_enable_xpti_tracing = "ON" + xpti_enable_werror = "OFF" if sys.platform != "darwin": sycl_enabled_plugins.append("level_zero") # lld is needed on Windows or for the HIP plugin on AMD - if platform.system() == 'Windows' or (args.hip and args.hip_platform == 'AMD'): - llvm_enable_projects += ';lld' + if platform.system() == "Windows" or (args.hip and args.hip_platform == "AMD"): + llvm_enable_projects += ";lld" libclc_enabled = args.cuda or args.hip or args.native_cpu if libclc_enabled: - llvm_enable_projects += ';libclc' + llvm_enable_projects += ";libclc" if args.cuda: - llvm_targets_to_build += ';NVPTX' + llvm_targets_to_build += ";NVPTX" libclc_targets_to_build = libclc_nvidia_target_names - libclc_gen_remangled_variants = 'ON' + libclc_gen_remangled_variants = "ON" sycl_enabled_plugins.append("cuda") if args.hip: - if args.hip_platform == 'AMD': - llvm_targets_to_build += ';AMDGPU' + if args.hip_platform == "AMD": + llvm_targets_to_build += ";AMDGPU" libclc_targets_to_build += libclc_amd_target_names - elif args.hip_platform == 'NVIDIA' and not args.cuda: - llvm_targets_to_build += ';NVPTX' + elif args.hip_platform == "NVIDIA" and not args.cuda: + llvm_targets_to_build += ";NVPTX" libclc_targets_to_build += libclc_nvidia_target_names - libclc_gen_remangled_variants = 'ON' + libclc_gen_remangled_variants = "ON" sycl_build_pi_hip_platform = args.hip_platform sycl_enabled_plugins.append("hip") @@ -96,28 +103,27 @@ def do_configure(args): libclc_gen_remangled_variants = "ON" sycl_enabled_plugins.append("native_cpu") - # all llvm compiler targets don't require 3rd party dependencies, so can be # built/tested even if specific runtimes are not available if args.enable_all_llvm_targets: - llvm_targets_to_build += ';NVPTX;AMDGPU' + llvm_targets_to_build += 
";NVPTX;AMDGPU" if args.werror or args.ci_defaults: - sycl_werror = 'ON' - xpti_enable_werror = 'ON' + sycl_werror = "ON" + xpti_enable_werror = "ON" if args.no_assertions: - llvm_enable_assertions = 'OFF' + llvm_enable_assertions = "OFF" if args.docs: - llvm_enable_doxygen = 'ON' - llvm_enable_sphinx = 'ON' + llvm_enable_doxygen = "ON" + llvm_enable_sphinx = "ON" if args.shared_libs: - llvm_build_shared_libs = 'ON' + llvm_build_shared_libs = "ON" if args.use_lld: - llvm_enable_lld = 'ON' + llvm_enable_lld = "ON" # CI Default conditionally appends to options, keep it at the bottom of # args handling @@ -131,32 +137,33 @@ def do_configure(args): if sys.platform != "darwin": # libclc is required for CI validation libclc_enabled = True - if 'libclc' not in llvm_enable_projects: - llvm_enable_projects += ';libclc' + if "libclc" not in llvm_enable_projects: + llvm_enable_projects += ";libclc" # libclc passes `--nvvm-reflect-enable=false`, build NVPTX to enable it - if 'NVPTX' not in llvm_targets_to_build: - llvm_targets_to_build += ';NVPTX' + if "NVPTX" not in llvm_targets_to_build: + llvm_targets_to_build += ";NVPTX" # since we are building AMD libclc target we must have AMDGPU target - if 'AMDGPU' not in llvm_targets_to_build: - llvm_targets_to_build += ';AMDGPU' + if "AMDGPU" not in llvm_targets_to_build: + llvm_targets_to_build += ";AMDGPU" # Add both NVIDIA and AMD libclc targets if libclc_amd_target_names not in libclc_targets_to_build: libclc_targets_to_build += libclc_amd_target_names if libclc_nvidia_target_names not in libclc_targets_to_build: libclc_targets_to_build += libclc_nvidia_target_names - libclc_gen_remangled_variants = 'ON' + libclc_gen_remangled_variants = "ON" if args.enable_plugin: sycl_enabled_plugins += args.enable_plugin if args.disable_preview_lib: - sycl_preview_lib = 'OFF' + sycl_preview_lib = "OFF" install_dir = os.path.join(abs_obj_dir, "install") cmake_cmd = [ "cmake", - "-G", args.cmake_gen, + "-G", + args.cmake_gen, "-DCMAKE_BUILD_TYPE={}".format(args.build_type), "-DLLVM_ENABLE_ASSERTIONS={}".format(llvm_enable_assertions), "-DLLVM_TARGETS_TO_BUILD={}".format(llvm_targets_to_build), @@ -173,7 +180,7 @@ def do_configure(args): "-DLLVM_BUILD_TOOLS=ON", "-DSYCL_ENABLE_WERROR={}".format(sycl_werror), "-DCMAKE_INSTALL_PREFIX={}".format(install_dir), - "-DSYCL_INCLUDE_TESTS=ON", # Explicitly include all kinds of SYCL tests. + "-DSYCL_INCLUDE_TESTS=ON", # Explicitly include all kinds of SYCL tests. 
"-DLLVM_ENABLE_DOXYGEN={}".format(llvm_enable_doxygen), "-DLLVM_ENABLE_SPHINX={}".format(llvm_enable_sphinx), "-DBUILD_SHARED_LIBS={}".format(llvm_build_shared_libs), @@ -181,7 +188,7 @@ def do_configure(args): "-DLLVM_ENABLE_LLD={}".format(llvm_enable_lld), "-DXPTI_ENABLE_WERROR={}".format(xpti_enable_werror), "-DSYCL_CLANG_EXTRA_FLAGS={}".format(sycl_clang_extra_flags), - "-DSYCL_ENABLE_PLUGINS={}".format(';'.join(set(sycl_enabled_plugins))), + "-DSYCL_ENABLE_PLUGINS={}".format(";".join(set(sycl_enabled_plugins))), "-DSYCL_ENABLE_KERNEL_FUSION={}".format(sycl_enable_fusion), "-DSYCL_ENABLE_MAJOR_RELEASE_PREVIEW_LIB={}".format(sycl_preview_lib), "-DBUG_REPORT_URL=https://github.com/intel/llvm/issues", @@ -199,31 +206,41 @@ def do_configure(args): ) if args.l0_headers and args.l0_loader: - cmake_cmd.extend([ - "-DLEVEL_ZERO_INCLUDE_DIR={}".format(args.l0_headers), - "-DLEVEL_ZERO_LIBRARY={}".format(args.l0_loader)]) + cmake_cmd.extend( + [ + "-DLEVEL_ZERO_INCLUDE_DIR={}".format(args.l0_headers), + "-DLEVEL_ZERO_LIBRARY={}".format(args.l0_loader), + ] + ) elif args.l0_headers or args.l0_loader: - sys.exit("Please specify both Level Zero headers and loader or don't specify " - "none of them to let download from github.com") + sys.exit( + "Please specify both Level Zero headers and loader or don't specify " + "none of them to let download from github.com" + ) # Add additional CMake options if provided if args.cmake_opt: - cmake_cmd += args.cmake_opt - + cmake_cmd += args.cmake_opt + if args.add_security_flags: - cmake_cmd.extend(["-DEXTRA_SECURITY_FLAGS={}".format(args.add_security_flags)]) + cmake_cmd.extend(["-DEXTRA_SECURITY_FLAGS={}".format(args.add_security_flags)]) # Add path to root CMakeLists.txt cmake_cmd.append(llvm_dir) if args.use_libcxx: - if not (args.libcxx_include and args.libcxx_library): - sys.exit("Please specify include and library path of libc++ when building sycl " - "runtime with it") - cmake_cmd.extend([ - "-DSYCL_USE_LIBCXX=ON", - "-DSYCL_LIBCXX_INCLUDE_PATH={}".format(args.libcxx_include), - "-DSYCL_LIBCXX_LIBRARY_PATH={}".format(args.libcxx_library)]) + if not (args.libcxx_include and args.libcxx_library): + sys.exit( + "Please specify include and library path of libc++ when building sycl " + "runtime with it" + ) + cmake_cmd.extend( + [ + "-DSYCL_USE_LIBCXX=ON", + "-DSYCL_LIBCXX_INCLUDE_PATH={}".format(args.libcxx_include), + "-DSYCL_LIBCXX_LIBRARY_PATH={}".format(args.libcxx_library), + ] + ) print("[Cmake Command]: {}".format(" ".join(map(shlex.quote, cmake_cmd)))) @@ -232,61 +249,158 @@ def do_configure(args): except subprocess.CalledProcessError: cmake_cache = os.path.join(abs_obj_dir, "CMakeCache.txt") if os.path.isfile(cmake_cache): - print("There is CMakeCache.txt at " + cmake_cache + - " ... you can try to remove it and rerun.") - print("Configure failed!") + print( + "There is CMakeCache.txt at " + + cmake_cache + + " ... you can try to remove it and rerun." 
+ ) + print("Configure failed!") return False return True + def main(): - parser = argparse.ArgumentParser(prog="configure.py", - description="Generate build files from CMake configuration files", - formatter_class=argparse.RawTextHelpFormatter) + parser = argparse.ArgumentParser( + prog="configure.py", + description="Generate build files from CMake configuration files", + formatter_class=argparse.RawTextHelpFormatter, + ) # CI system options - parser.add_argument("-n", "--build-number", metavar="BUILD_NUM", help="build number") + parser.add_argument( + "-n", "--build-number", metavar="BUILD_NUM", help="build number" + ) parser.add_argument("-b", "--branch", metavar="BRANCH", help="pull request branch") - parser.add_argument("-d", "--base-branch", metavar="BASE_BRANCH", help="pull request base branch") - parser.add_argument("-r", "--pr-number", metavar="PR_NUM", help="pull request number") - parser.add_argument("-w", "--builder-dir", metavar="BUILDER_DIR", - help="builder directory, which is the directory containing source and build directories") + parser.add_argument( + "-d", "--base-branch", metavar="BASE_BRANCH", help="pull request base branch" + ) + parser.add_argument( + "-r", "--pr-number", metavar="PR_NUM", help="pull request number" + ) + parser.add_argument( + "-w", + "--builder-dir", + metavar="BUILDER_DIR", + help="builder directory, which is the directory containing source and build directories", + ) # User options - parser.add_argument("-s", "--src-dir", metavar="SRC_DIR", help="source directory (autodetected by default)") - parser.add_argument("-o", "--obj-dir", metavar="OBJ_DIR", help="build directory. (/build by default)") - parser.add_argument("--l0-headers", metavar="L0_HEADER_DIR", help="directory with Level Zero headers") - parser.add_argument("--l0-loader", metavar="L0_LOADER", help="path to the Level Zero loader") - parser.add_argument("-t", "--build-type", - metavar="BUILD_TYPE", default="Release", help="build type: Debug, Release") - parser.add_argument("--cuda", action='store_true', help="switch from OpenCL to CUDA") - parser.add_argument("--native_cpu", action='store_true', help="Enable SYCL Native CPU") - parser.add_argument("--hip", action='store_true', help="switch from OpenCL to HIP") - parser.add_argument("--hip-platform", type=str, choices=['AMD', 'NVIDIA'], default='AMD', help="choose hardware platform for HIP backend") - parser.add_argument("--host-target", default='host', - help="host LLVM target architecture, defaults to \'host\', multiple targets may be provided as a semi-colon separated string") - parser.add_argument("--enable-all-llvm-targets", action='store_true', help="build compiler with all supported targets, it doesn't change runtime build") - parser.add_argument("--no-assertions", action='store_true', help="build without assertions") - parser.add_argument("--docs", action='store_true', help="build Doxygen documentation") - parser.add_argument("--werror", action='store_true', help="Treat warnings as errors") - parser.add_argument("--shared-libs", action='store_true', help="Build shared libraries") - parser.add_argument("--cmake-opt", action='append', help="Additional CMake option not configured via script parameters") + parser.add_argument( + "-s", + "--src-dir", + metavar="SRC_DIR", + help="source directory (autodetected by default)", + ) + parser.add_argument( + "-o", + "--obj-dir", + metavar="OBJ_DIR", + help="build directory. 
(/build by default)", + ) + parser.add_argument( + "--l0-headers", + metavar="L0_HEADER_DIR", + help="directory with Level Zero headers", + ) + parser.add_argument( + "--l0-loader", metavar="L0_LOADER", help="path to the Level Zero loader" + ) + parser.add_argument( + "-t", + "--build-type", + metavar="BUILD_TYPE", + default="Release", + help="build type: Debug, Release", + ) + parser.add_argument( + "--cuda", action="store_true", help="switch from OpenCL to CUDA" + ) + parser.add_argument( + "--native_cpu", action="store_true", help="Enable SYCL Native CPU" + ) + parser.add_argument("--hip", action="store_true", help="switch from OpenCL to HIP") + parser.add_argument( + "--hip-platform", + type=str, + choices=["AMD", "NVIDIA"], + default="AMD", + help="choose hardware platform for HIP backend", + ) + parser.add_argument( + "--host-target", + default="host", + help="host LLVM target architecture, defaults to 'host', multiple targets may be provided as a semi-colon separated string", + ) + parser.add_argument( + "--enable-all-llvm-targets", + action="store_true", + help="build compiler with all supported targets, it doesn't change runtime build", + ) + parser.add_argument( + "--no-assertions", action="store_true", help="build without assertions" + ) + parser.add_argument( + "--docs", action="store_true", help="build Doxygen documentation" + ) + parser.add_argument( + "--werror", action="store_true", help="Treat warnings as errors" + ) + parser.add_argument( + "--shared-libs", action="store_true", help="Build shared libraries" + ) + parser.add_argument( + "--cmake-opt", + action="append", + help="Additional CMake option not configured via script parameters", + ) parser.add_argument("--cmake-gen", default="Ninja", help="CMake generator") - parser.add_argument("--use-libcxx", action="store_true", help="build sycl runtime with libcxx") - parser.add_argument("--libcxx-include", metavar="LIBCXX_INCLUDE_PATH", help="libcxx include path") - parser.add_argument("--libcxx-library", metavar="LIBCXX_LIBRARY_PATH", help="libcxx library path") - parser.add_argument("--use-lld", action="store_true", help="Use LLD linker for build") - parser.add_argument("--llvm-external-projects", help="Add external projects to build. Add as comma seperated list.") - parser.add_argument("--ci-defaults", action="store_true", help="Enable default CI parameters") - parser.add_argument("--enable-plugin", action='append', help="Enable SYCL plugin") - parser.add_argument("--disable-preview-lib", action='store_true', help="Disable building of the SYCL runtime major release preview library") - parser.add_argument("--disable-fusion", action="store_true", help="Disable the kernel fusion JIT compiler") - parser.add_argument("--add_security_flags", type=str, choices=['none', 'default', 'sanitize'], default=None, help="Enables security flags for compile & link. Two values are supported: 'default' and 'sanitize'. 
'Sanitize' option is an extension of 'default' set.") - parser.add_argument('--native-cpu-libclc-targets', help='Target triples for libclc, used by the Native CPU backend') + parser.add_argument( + "--use-libcxx", action="store_true", help="build sycl runtime with libcxx" + ) + parser.add_argument( + "--libcxx-include", metavar="LIBCXX_INCLUDE_PATH", help="libcxx include path" + ) + parser.add_argument( + "--libcxx-library", metavar="LIBCXX_LIBRARY_PATH", help="libcxx library path" + ) + parser.add_argument( + "--use-lld", action="store_true", help="Use LLD linker for build" + ) + parser.add_argument( + "--llvm-external-projects", + help="Add external projects to build. Add as comma seperated list.", + ) + parser.add_argument( + "--ci-defaults", action="store_true", help="Enable default CI parameters" + ) + parser.add_argument("--enable-plugin", action="append", help="Enable SYCL plugin") + parser.add_argument( + "--disable-preview-lib", + action="store_true", + help="Disable building of the SYCL runtime major release preview library", + ) + parser.add_argument( + "--disable-fusion", + action="store_true", + help="Disable the kernel fusion JIT compiler", + ) + parser.add_argument( + "--add_security_flags", + type=str, + choices=["none", "default", "sanitize"], + default=None, + help="Enables security flags for compile & link. Two values are supported: 'default' and 'sanitize'. 'Sanitize' option is an extension of 'default' set.", + ) + parser.add_argument( + "--native-cpu-libclc-targets", + help="Target triples for libclc, used by the Native CPU backend", + ) args = parser.parse_args() print("args:{}".format(args)) return do_configure(args) + if __name__ == "__main__": ret = main() exit_code = 0 if ret else 1 diff --git a/buildbot/dependency.py b/buildbot/dependency.py index e79eae2b62cc1..3eac3cc7a92a8 100644 --- a/buildbot/dependency.py +++ b/buildbot/dependency.py @@ -4,6 +4,7 @@ import subprocess import sys + def do_dependency(args): ret = False @@ -11,7 +12,8 @@ def do_dependency(args): if args.pr_number is not None and not args.clean_build: if args.branch is None or args.base_branch is None: "branch ({}) and base branch ({}) is required for pull request #{}".format( - args.branch, args.base_branch, args.pr_number) + args.branch, args.base_branch, args.pr_number + ) return ret # fetching the recent state of base branch fetch_cmd = ["git", "fetch", "origin", args.base_branch] @@ -25,14 +27,31 @@ def do_dependency(args): print(checkout_cmd) subprocess.check_call(checkout_cmd, cwd=args.src_dir) # get baseline commit - merge_base_cmd = ["git", "merge-base", "origin/{}".format(args.base_branch), args.branch] + merge_base_cmd = [ + "git", + "merge-base", + "origin/{}".format(args.base_branch), + args.branch, + ] print(merge_base_cmd) base_commit = subprocess.check_output(merge_base_cmd, cwd=args.src_dir) - base_commit = base_commit.rstrip() - diff_cmd = ["git", "--no-pager", "diff", base_commit, args.branch, "--name-only", "buildbot"] + base_commit = base_commit.rstrip() + diff_cmd = [ + "git", + "--no-pager", + "diff", + base_commit, + args.branch, + "--name-only", + "buildbot", + ] print(diff_cmd) changed_build_scripts = subprocess.check_output(diff_cmd, cwd=args.src_dir) - changed_build_scripts = changed_build_scripts.rstrip() if changed_build_scripts is not None else None + changed_build_scripts = ( + changed_build_scripts.rstrip() + if changed_build_scripts is not None + else None + ) # clean build directory if build scripts have changed if len(changed_build_scripts) > 0: if 
os.path.isdir(args.obj_dir): @@ -49,8 +68,14 @@ def do_dependency(args): # fetch OpenCL headers ocl_header_dir = os.path.join(args.obj_dir, "OpenCL-Headers") if not os.path.isdir(ocl_header_dir): - clone_cmd = ["git", "clone", "https://github.com/KhronosGroup/OpenCL-Headers", - "OpenCL-Headers", "-b", "main"] + clone_cmd = [ + "git", + "clone", + "https://github.com/KhronosGroup/OpenCL-Headers", + "OpenCL-Headers", + "-b", + "main", + ] subprocess.check_call(clone_cmd, cwd=args.obj_dir) else: fetch_cmd = ["git", "pull", "--ff", "--ff-only", "origin"] @@ -64,9 +89,14 @@ def do_dependency(args): # fetch and build OpenCL ICD loader icd_loader_dir = os.path.join(args.obj_dir, "OpenCL-ICD-Loader") if not os.path.isdir(icd_loader_dir): - clone_cmd = ["git", "clone", - "https://github.com/KhronosGroup/OpenCL-ICD-Loader", - "OpenCL-ICD-Loader", "-b", "main"] + clone_cmd = [ + "git", + "clone", + "https://github.com/KhronosGroup/OpenCL-ICD-Loader", + "OpenCL-ICD-Loader", + "-b", + "main", + ] subprocess.check_call(clone_cmd, cwd=args.obj_dir) else: @@ -83,36 +113,60 @@ def do_dependency(args): shutil.rmtree(icd_build_dir) os.makedirs(icd_build_dir) install_dir = os.path.join(args.obj_dir, "install") - cmake_cmd = ["cmake", "-G", "Ninja", - "-DCMAKE_INSTALL_PREFIX={}".format(install_dir), - "-DOPENCL_ICD_LOADER_HEADERS_DIR={}".format(ocl_header_dir), - ".." ] + cmake_cmd = [ + "cmake", + "-G", + "Ninja", + "-DCMAKE_INSTALL_PREFIX={}".format(install_dir), + "-DOPENCL_ICD_LOADER_HEADERS_DIR={}".format(ocl_header_dir), + "..", + ] print("[Cmake Command]: {}".format(" ".join(cmake_cmd))) - + subprocess.check_call(cmake_cmd, cwd=icd_build_dir) - env_tmp=os.environ + env_tmp = os.environ env_tmp["C_INCLUDE_PATH"] = "{}".format(ocl_header_dir) subprocess.check_call(["ninja", "install"], env=env_tmp, cwd=icd_build_dir) ret = True return ret + def main(): - parser = argparse.ArgumentParser(prog="dependency.py", - description="script to get and build dependency", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("-n", "--build-number", metavar="BUILD_NUM", help="build number") + parser = argparse.ArgumentParser( + prog="dependency.py", + description="script to get and build dependency", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "-n", "--build-number", metavar="BUILD_NUM", help="build number" + ) parser.add_argument("-b", "--branch", metavar="BRANCH", help="pull request branch") - parser.add_argument("-d", "--base-branch", metavar="BASE_BRANCH", help="pull request base branch") - parser.add_argument("-r", "--pr-number", metavar="PR_NUM", help="pull request number") - parser.add_argument("-w", "--builder-dir", metavar="BUILDER_DIR", - help="builder directory, which is the directory containing source and build directories") + parser.add_argument( + "-d", "--base-branch", metavar="BASE_BRANCH", help="pull request base branch" + ) + parser.add_argument( + "-r", "--pr-number", metavar="PR_NUM", help="pull request number" + ) + parser.add_argument( + "-w", + "--builder-dir", + metavar="BUILDER_DIR", + help="builder directory, which is the directory containing source and build directories", + ) parser.add_argument("-s", "--src-dir", metavar="SRC_DIR", help="source directory") - parser.add_argument("-o", "--obj-dir", metavar="OBJ_DIR", required=True, help="build directory") - parser.add_argument("-c", "--clean-build", action="store_true", default=False, - help="true if the build is clean build which has clobber step") + parser.add_argument( + "-o", "--obj-dir", 
metavar="OBJ_DIR", required=True, help="build directory" + ) + parser.add_argument( + "-c", + "--clean-build", + action="store_true", + default=False, + help="true if the build is clean build which has clobber step", + ) args = parser.parse_args() @@ -120,6 +174,7 @@ def main(): return do_dependency(args) + if __name__ == "__main__": ret = main() exit_code = 0 if ret else 1 diff --git a/devops/dependencies.json b/devops/dependencies.json index 00a1d7db2b692..6340db87bc5a2 100644 --- a/devops/dependencies.json +++ b/devops/dependencies.json @@ -1,15 +1,15 @@ { "linux": { "compute_runtime": { - "github_tag": "24.22.29735.20", - "version": "24.22.29735.20", - "url": "https://github.com/intel/compute-runtime/releases/tag/24.22.29735.20", + "github_tag": "24.26.30049.6", + "version": "24.26.30049.6", + "url": "https://github.com/intel/compute-runtime/releases/tag/24.26.30049.6", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "igc": { - "github_tag": "igc-1.0.16900.23", - "version": "1.0.16900.23", - "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/igc-1.0.16900.23", + "github_tag": "igc-1.0.17193.4", + "version": "1.0.17193.4", + "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/igc-1.0.17193.4", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "cm": { @@ -19,9 +19,9 @@ "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "level_zero": { - "github_tag": "v1.17.17", - "version": "v1.17.17", - "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.17.17", + "github_tag": "v1.17.19", + "version": "v1.17.19", + "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.17.19", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "tbb": { diff --git a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl index 1cf3fb8750c2e..eb011986659eb 100644 --- a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl +++ b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl @@ -45,10 +45,54 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory, _CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void __spirv_ControlBarrier(unsigned int scope, unsigned int memory, unsigned int semantics) { + unsigned int order = semantics & 0x1F; if (scope == Subgroup) { // use a full mask as barriers are required to be convergent and exited // threads can safely be in the mask __nvvm_bar_warp_sync(0xFFFFFFFF); + } else if (scope == Device && memory == Device && + order == SequentiallyConsistent && + __clc_nvvm_reflect_arch() >= 700) { + unsigned int env1, env2; + __asm__ __volatile__("mov.u32 %0, %%envreg1;" : "=r"(env1)); + __asm__ __volatile__("mov.u32 %0, %%envreg2;" : "=r"(env2)); + long long envreg1 = env1; + long long envreg2 = env2; + // Bit field insert operation. Place 32 bits of envreg2 next to 32 bits of + // envreg1: s64[envreg2][envreg1]. The resulting value is the address in + // device global memory region, where atomic operations can be performed. + long long atomicAddr; + __asm__ __volatile__("bfi.b64 %0, %1, %2, 32, 32;" + : "=l"(atomicAddr) + : "l"(envreg1), "l"(envreg2)); + if (!atomicAddr) { + __builtin_trap(); + } else { + unsigned int tidX = __nvvm_read_ptx_sreg_tid_x(); + unsigned int tidY = __nvvm_read_ptx_sreg_tid_y(); + unsigned int tidZ = __nvvm_read_ptx_sreg_tid_z(); + if (tidX + tidY + tidZ == 0) { + // Increment address by 4 to get the precise region initialized to 0. 
+ atomicAddr += 4; + unsigned int nctaidX = __nvvm_read_ptx_sreg_nctaid_x(); + unsigned int nctaidY = __nvvm_read_ptx_sreg_nctaid_y(); + unsigned int nctaidZ = __nvvm_read_ptx_sreg_nctaid_z(); + unsigned int totalNctaid = nctaidX * nctaidY * nctaidZ; + + // Do atomic.add(1) for each CTA and spin ld.acquire in a loop until all + // CTAs have performed the addition + unsigned int prev, current; + __asm__ __volatile__("atom.add.release.gpu.u32 %0,[%1],1;" + : "=r"(prev) + : "l"(atomicAddr)); + do { + __asm__ __volatile__("ld.acquire.gpu.u32 %0,[%1];" + : "=r"(current) + : "l"(atomicAddr)); + } while (current % totalNctaid != 0); + } + __nvvm_barrier_sync(0); + } } else { __syncthreads(); } diff --git a/libdevice/sanitizer_utils.cpp b/libdevice/sanitizer_utils.cpp index f59bc17bb948e..c2a4f7124dbff 100644 --- a/libdevice/sanitizer_utils.cpp +++ b/libdevice/sanitizer_utils.cpp @@ -862,4 +862,35 @@ __asan_set_shadow_dynamic_local(uptr ptr, uint32_t num_args) { __spirv_ocl_printf(__mem_set_shadow_dynamic_local_end); } +/// +/// ASAN initialize shdadow memory of private memory +/// + +static __SYCL_CONSTANT__ const char __mem_set_shadow_private_begin[] = + "[kernel] BEGIN __asan_set_shadow_private\n"; +static __SYCL_CONSTANT__ const char __mem_set_shadow_private_end[] = + "[kernel] END __asan_set_shadow_private\n"; +static __SYCL_CONSTANT__ const char __mem_set_shadow_private[] = + "[kernel] set_shadow_private(beg=%p, end=%p, val:%02X)\n"; + +DEVICE_EXTERN_C_NOINLINE void __asan_set_shadow_private(uptr begin, uptr size, + char val) { + if (__AsanDebug) + __spirv_ocl_printf(__mem_set_shadow_private_begin); + + auto *launch_info = (__SYCL_GLOBAL__ const LaunchInfo *)__AsanLaunchInfo; + if (launch_info->PrivateShadowOffset == 0) + return; + + if (__AsanDebug) + __spirv_ocl_printf(__mem_set_shadow_private, (void *)begin, + (void *)(begin + size), val & 0xFF); + + for (size_t i = 0; i < size; i++) + ((__SYCL_GLOBAL__ u8 *)begin)[i] = val; + + if (__AsanDebug) + __spirv_ocl_printf(__mem_set_shadow_private_end); +} + #endif // __SPIR__ || __SPIRV__ diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 4939808595680..4fef4c918351d 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -53,8 +53,8 @@ def AspectExt_oneapi_bindless_images : Aspect<"ext_oneapi_bindless_images">; def AspectExt_oneapi_bindless_images_shared_usm : Aspect<"ext_oneapi_bindless_images_shared_usm">; def AspectExt_oneapi_bindless_images_1d_usm : Aspect<"ext_oneapi_bindless_images_1d_usm">; def AspectExt_oneapi_bindless_images_2d_usm : Aspect<"ext_oneapi_bindless_images_2d_usm">; -def AspectExt_oneapi_interop_memory_import : Aspect<"ext_oneapi_interop_memory_import">; -def AspectExt_oneapi_interop_semaphore_import : Aspect<"ext_oneapi_interop_semaphore_import">; +def AspectExt_oneapi_external_memory_import : Aspect<"ext_oneapi_external_memory_import">; +def AspectExt_oneapi_external_semaphore_import : Aspect<"ext_oneapi_external_semaphore_import">; def AspectExt_oneapi_mipmap : Aspect<"ext_oneapi_mipmap">; def AspectExt_oneapi_mipmap_anisotropy : Aspect<"ext_oneapi_mipmap_anisotropy">; def AspectExt_oneapi_mipmap_level_reference : Aspect<"ext_oneapi_mipmap_level_reference">; @@ -130,9 +130,9 @@ def : TargetInfo<"__TestAspectList", AspectExt_intel_device_id, AspectExt_intel_memory_clock_rate, AspectExt_intel_memory_bus_width, AspectEmulated, AspectExt_intel_legacy_image, AspectExt_oneapi_bindless_images, 
AspectExt_oneapi_bindless_images_shared_usm, AspectExt_oneapi_bindless_images_1d_usm, AspectExt_oneapi_bindless_images_2d_usm, - AspectExt_oneapi_interop_memory_import, AspectExt_oneapi_interop_semaphore_import, + AspectExt_oneapi_external_memory_import, AspectExt_oneapi_external_semaphore_import, AspectExt_oneapi_mipmap, AspectExt_oneapi_mipmap_anisotropy, AspectExt_oneapi_mipmap_level_reference, - AspectExt_oneapi_bindless_sampled_image_fetch_3d, AspectExt_oneapi_cubemap, + AspectExt_oneapi_cubemap, AspectExt_oneapi_cubemap_seamless_filtering, AspectExt_oneapi_image_array, AspectExt_oneapi_unique_addressing_per_dim, @@ -140,6 +140,7 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_bindless_images_sample_2d_usm, AspectExt_oneapi_bindless_sampled_image_fetch_1d_usm, AspectExt_oneapi_bindless_sampled_image_fetch_1d, AspectExt_oneapi_bindless_sampled_image_fetch_2d_usm, AspectExt_oneapi_bindless_sampled_image_fetch_2d, + AspectExt_oneapi_bindless_sampled_image_fetch_3d, AspectExt_intel_esimd, AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, @@ -220,8 +221,8 @@ defvar CudaMinAspects = !listconcat(AllUSMAspects, [AspectGpu, AspectFp64, Aspec AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph]); // Bindless images aspects are partially supported on CUDA and disabled by default at the moment. defvar CudaBindlessImagesAspects = [AspectExt_oneapi_bindless_images, AspectExt_oneapi_bindless_images_shared_usm, - AspectExt_oneapi_bindless_images_1d_usm, AspectExt_oneapi_bindless_images_2d_usm, AspectExt_oneapi_interop_memory_import, - AspectExt_oneapi_interop_semaphore_import, AspectExt_oneapi_mipmap, AspectExt_oneapi_mipmap_anisotropy, + AspectExt_oneapi_bindless_images_1d_usm, AspectExt_oneapi_bindless_images_2d_usm, AspectExt_oneapi_external_memory_import, + AspectExt_oneapi_external_semaphore_import, AspectExt_oneapi_mipmap, AspectExt_oneapi_mipmap_anisotropy, AspectExt_oneapi_mipmap_level_reference, AspectExt_oneapi_cubemap, AspectExt_oneapi_cubemap_seamless_filtering, AspectExt_oneapi_image_array, AspectExt_oneapi_unique_addressing_per_dim, AspectExt_oneapi_bindless_images_sample_2d_usm, AspectExt_oneapi_bindless_images_sample_2d_usm]; diff --git a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h index bbd0213158d46..8bf8bdf894d07 100644 --- a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h +++ b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h @@ -59,7 +59,7 @@ class SpecConstantsPass : public PassInfoMixin { enum class HandlingMode { default_values, emulation, native }; public: - SpecConstantsPass(HandlingMode Mode) : Mode(Mode) {} + SpecConstantsPass(HandlingMode Mode = HandlingMode::emulation) : Mode(Mode) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); // Searches given module for occurrences of specialization constant-specific @@ -73,7 +73,7 @@ class SpecConstantsPass : public PassInfoMixin { std::vector &DefaultValues); private: - HandlingMode Mode = HandlingMode::emulation; + HandlingMode Mode; }; bool checkModuleContainsSpecConsts(const Module &M); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 5bbaea52085e3..4c09bd60a8e65 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -131,6 +131,7 @@ #include 
"llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h" #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h" #include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h" +#include "llvm/SYCLLowerIR/SpecConstants.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index f306e77b43afe..9f4297d0522da 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -164,6 +164,7 @@ MODULE_PASS("lower-slm-reservation-calls", ESIMDLowerSLMReservationCalls()) MODULE_PASS("record-sycl-aspect-names", RecordSYCLAspectNamesPass()) MODULE_PASS("sycl-virtual-functions-analysis", SYCLVirtualFunctionsAnalysisPass()) +MODULE_PASS("spec-constants", SpecConstantsPass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 9506dcda2bcc1..a6609adce3429 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1289,6 +1289,20 @@ translateSpirvGlobalUses(LoadInst *LI, StringRef SpirvGlobalName, } } +static void translateGlobalUse(Value *Use, StringRef SpirvGlobalName, + SmallVectorImpl &InstsToErase) { + LoadInst *LI = dyn_cast(Use); + ConstantExpr *CE = dyn_cast(Use); + GetElementPtrConstantExpr *GEPCE = dyn_cast(Use); + if (LI != nullptr) { + translateSpirvGlobalUses(LI, SpirvGlobalName, InstsToErase); + } else if (CE != nullptr || GEPCE != nullptr) { + for (User *U : (CE == nullptr ? GEPCE : CE)->users()) { + translateGlobalUse(U, SpirvGlobalName, InstsToErase); + } + } +} + static void createESIMDIntrinsicArgs(const ESIMDIntrinDesc &Desc, SmallVector &GenXArgs, CallInst &CI, id::FunctionEncoding *FE) { @@ -2090,6 +2104,18 @@ PreservedAnalyses SYCLLowerESIMDPass::run(Module &M, MPM.run(M, MAM); } + SmallVector ToErase; + constexpr size_t PrefLen = StringRef(SPIRV_INTRIN_PREF).size(); + for (GlobalVariable &Global : M.globals()) { + if (!Global.getName().starts_with(SPIRV_INTRIN_PREF)) + continue; + + for (User *U : Global.users()) + translateGlobalUse(U, Global.getName().drop_front(PrefLen), ToErase); + } + for (auto *CI : ToErase) + CI->eraseFromParent(); + generateKernelMetadata(M); // This function needs to run after generateKernelMetadata, as it // uses the generated metadata: @@ -2244,37 +2270,6 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, // this is ESIMD intrinsic - record for later translation ESIMDIntrCalls.push_back(CI); } - - // Translate loads from SPIRV builtin globals into GenX intrinsics - auto *LI = dyn_cast(&I); - if (LI) { - Value *LoadPtrOp = LI->getPointerOperand(); - Value *SpirvGlobal = nullptr; - // Look through constant expressions to find SPIRV builtin globals - // It may come with or without cast. - auto *CE = dyn_cast(LoadPtrOp); - auto *GEPCE = dyn_cast(LoadPtrOp); - if (GEPCE) { - SpirvGlobal = GEPCE->getOperand(0); - } else if (CE) { - assert(CE->isCast() && "ConstExpr should be a cast"); - SpirvGlobal = CE->getOperand(0); - } else { - SpirvGlobal = LoadPtrOp; - } - - if (!isa(SpirvGlobal) || - !SpirvGlobal->getName().starts_with(SPIRV_INTRIN_PREF)) - continue; - - auto PrefLen = StringRef(SPIRV_INTRIN_PREF).size(); - - // Translate all uses of the load instruction from SPIRV builtin global. - // Replaces the original global load and it is uses and stores the old - // instructions to ToErase. 
- translateSpirvGlobalUses(LI, SpirvGlobal->getName().drop_front(PrefLen), - ToErase); - } } // Now demangle and translate found ESIMD intrinsic calls for (auto *CI : ESIMDIntrCalls) { diff --git a/llvm/lib/SYCLLowerIR/SpecConstants.cpp b/llvm/lib/SYCLLowerIR/SpecConstants.cpp index 58f5a0d54b26e..4f43a22e95fd9 100644 --- a/llvm/lib/SYCLLowerIR/SpecConstants.cpp +++ b/llvm/lib/SYCLLowerIR/SpecConstants.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/TargetParser/Triple.h" #include @@ -101,12 +102,16 @@ StringRef getStringLiteralArg(const CallInst *CI, unsigned ArgNo, // so that %1 is trivially known to be the address of the @.str literal. Value *TmpPtr = L->getPointerOperand(); - AssertRelease((isa(TmpPtr) && - isa(cast(TmpPtr) - ->getPointerOperand() - ->stripPointerCasts())) || - isa(TmpPtr), - "unexpected instruction type"); + auto ValueIsAlloca = [](Value *V) { + if (auto *ASC = dyn_cast(V)) + V = ASC->getPointerOperand()->stripPointerCasts(); + using namespace PatternMatch; + Value *X; + if (match(V, m_IntToPtr(m_Add(m_PtrToInt(m_Value(X)), m_ConstantInt())))) + V = X; + return isa(V); + }; + AssertRelease(ValueIsAlloca(TmpPtr), "unexpected instruction type"); // find the store of the literal address into TmpPtr StoreInst *Store = nullptr; diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index f500440e02706..4ee247821fb11 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1042,6 +1042,7 @@ struct FunctionStackPoisoner : public InstVisitor { FunctionCallee AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1], AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1]; FunctionCallee AsanSetShadowFunc[0x100] = {}; + FunctionCallee AsanSetShadowPrivateFunc; FunctionCallee AsanPoisonStackMemoryFunc, AsanUnpoisonStackMemoryFunc; FunctionCallee AsanAllocaPoisonFunc, AsanAllocasUnpoisonFunc; @@ -1257,10 +1258,11 @@ struct FunctionStackPoisoner : public InstVisitor { // ShadowMask is not zero. If ShadowMask[i] is zero, we assume that // ShadowBytes[i] is constantly zero and doesn't need to be overwritten. 
void copyToShadow(ArrayRef ShadowMask, ArrayRef ShadowBytes, - IRBuilder<> &IRB, Value *ShadowBase); + IRBuilder<> &IRB, Value *ShadowBase, + bool ForceOutline = false); void copyToShadow(ArrayRef ShadowMask, ArrayRef ShadowBytes, size_t Begin, size_t End, IRBuilder<> &IRB, - Value *ShadowBase); + Value *ShadowBase, bool ForceOutline = false); void copyToShadowInline(ArrayRef ShadowMask, ArrayRef ShadowBytes, size_t Begin, size_t End, IRBuilder<> &IRB, Value *ShadowBase); @@ -3593,6 +3595,9 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) { AsanSetShadowFunc[Val] = M.getOrInsertFunction(Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy); } + AsanSetShadowPrivateFunc = + M.getOrInsertFunction("__asan_set_shadow_private", IRB.getVoidTy(), + IntptrTy, IntptrTy, IRB.getInt8Ty()); AsanAllocaPoisonFunc = M.getOrInsertFunction( kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy); @@ -3655,14 +3660,17 @@ void FunctionStackPoisoner::copyToShadowInline(ArrayRef ShadowMask, void FunctionStackPoisoner::copyToShadow(ArrayRef ShadowMask, ArrayRef ShadowBytes, - IRBuilder<> &IRB, Value *ShadowBase) { - copyToShadow(ShadowMask, ShadowBytes, 0, ShadowMask.size(), IRB, ShadowBase); + IRBuilder<> &IRB, Value *ShadowBase, + bool ForceOutline) { + copyToShadow(ShadowMask, ShadowBytes, 0, ShadowMask.size(), IRB, ShadowBase, + ForceOutline); } void FunctionStackPoisoner::copyToShadow(ArrayRef ShadowMask, ArrayRef ShadowBytes, size_t Begin, size_t End, - IRBuilder<> &IRB, Value *ShadowBase) { + IRBuilder<> &IRB, Value *ShadowBase, + bool ForceOutline) { assert(ShadowMask.size() == ShadowBytes.size()); size_t Done = Begin; for (size_t i = Begin, j = Begin + 1; i < End; i = j++) { @@ -3671,14 +3679,20 @@ void FunctionStackPoisoner::copyToShadow(ArrayRef ShadowMask, continue; } uint8_t Val = ShadowBytes[i]; - if (!AsanSetShadowFunc[Val]) + if (!AsanSetShadowFunc[Val] && !ForceOutline) continue; // Skip same values. for (; j < End && ShadowMask[j] && Val == ShadowBytes[j]; ++j) { } - if (j - i >= ASan.MaxInlinePoisoningSize) { + if (ForceOutline) { + RTCI.createRuntimeCall( + IRB, AsanSetShadowPrivateFunc, + {IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)), + ConstantInt::get(IntptrTy, j - i), + ConstantInt::get(IRB.getInt8Ty(), Val)}); + } else if (j - i >= ASan.MaxInlinePoisoningSize) { copyToShadowInline(ShadowMask, ShadowBytes, Done, i, IRB, ShadowBase); RTCI.createRuntimeCall( IRB, AsanSetShadowFunc[Val], @@ -3688,7 +3702,8 @@ void FunctionStackPoisoner::copyToShadow(ArrayRef ShadowMask, } } - copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase); + if (!ForceOutline) + copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase); } // Fake stack allocator (asan_fake_stack.h) has 11 size classes @@ -4062,7 +4077,8 @@ void FunctionStackPoisoner::processStaticAllocas() { ASan.memToShadow(LocalStackBase, IRB, kSpirOffloadPrivateAS); // As mask we must use most poisoned case: red zones and after scope. // As bytes we can use either the same or just red zones only. 
- copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase); + copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase, + TargetTriple.isSPIROrSPIRV()); if (!StaticAllocaPoisonCallVec.empty()) { const auto &ShadowInScope = GetShadowBytes(SVD, L); @@ -4132,7 +4148,8 @@ void FunctionStackPoisoner::processStaticAllocas() { IRBuilder<> IRBElse(ElseTerm); copyToShadow(ShadowAfterScope, ShadowClean, IRBElse, ShadowBase); } else { - copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase); + copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase, + TargetTriple.isSPIROrSPIRV()); } } diff --git a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/instrument_private_address_space.ll b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/instrument_private_address_space.ll index bf412b8225b79..88c0bf21ed54f 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/instrument_private_address_space.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/instrument_private_address_space.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-stack=0 -asan-globals=0 -asan-constructor-kind=none -asan-spir-privates=1 -asan-use-after-return=never -S | FileCheck %s +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-stack=0 -asan-globals=0 -asan-constructor-kind=none -asan-mapping-scale=4 -asan-spir-privates=1 -asan-use-after-return=never -S | FileCheck %s target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" @@ -21,11 +21,15 @@ define spir_kernel void @kernel() #0 { entry: %p.i = alloca [4 x i32], align 4 ; CHECK: %shadow_ptr = call i64 @__asan_mem_to_shadow(i64 %0, i32 0) + ; CHECK: call void @__asan_set_shadow_private(i64 %4, i64 2, i8 -15) + ; CHECK: call void @__asan_set_shadow_private(i64 %5, i64 1, i8 -13) call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %p.i) call void @llvm.memcpy.p0.p1.i64(ptr align 4 %p.i, ptr addrspace(1) align 4 @__const._ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlvE_clEv.p, i64 16, i1 false) %arraydecay.i = getelementptr inbounds [4 x i32], ptr %p.i, i64 0, i64 0 %0 = addrspacecast ptr %arraydecay.i to ptr addrspace(4) %call.i = call spir_func i32 @_Z3fooPii(ptr addrspace(4) %0) + ; CHECK: call void @__asan_set_shadow_private(i64 %7, i64 2, i8 0) + ; CHECK: call void @__asan_set_shadow_private(i64 %8, i64 1, i8 0) ret void } diff --git a/llvm/test/SYCLLowerIR/SpecConstants/literal-address-alloca-asan.ll b/llvm/test/SYCLLowerIR/SpecConstants/literal-address-alloca-asan.ll new file mode 100644 index 0000000000000..1b904abfa0f3a --- /dev/null +++ b/llvm/test/SYCLLowerIR/SpecConstants/literal-address-alloca-asan.ll @@ -0,0 +1,33 @@ +; RUN: opt -passes=spec-constants %s -S -o - | FileCheck %s + +; Check there is no assert error when literal address is loaded from an alloca +; with offset. 
+ +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::specialization_id" = type { i32 } + +@_ZL9test_id_1 = addrspace(1) constant %"class.sycl::_V1::specialization_id" { i32 42 } +@__usid_str = constant [36 x i8] c"uide7faddc6b4d2fe92____ZL9test_id_1\00" + +define spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_14kernel_handlerEE_clES4_(ptr addrspace(4) %this1.i7) { +entry: + %MyAlloca = alloca i8, i64 224, align 32 + %0 = ptrtoint ptr %MyAlloca to i64 + %1 = add i64 %0, 96 + %2 = inttoptr i64 %1 to ptr + %SymbolicID.ascast.i = addrspacecast ptr %2 to ptr addrspace(4) + store ptr addrspace(4) addrspacecast (ptr @__usid_str to ptr addrspace(4)), ptr addrspace(4) %SymbolicID.ascast.i, align 8 + %3 = load ptr addrspace(4), ptr addrspace(4) %SymbolicID.ascast.i, align 8 + %4 = load ptr addrspace(4), ptr addrspace(4) %this1.i7, align 8 + +; CHECK-NOT: call spir_func noundef i32 @_Z37__sycl_getScalar2020SpecConstantValueIiET_PKcPKvS4_( +; CHECK: %conv = sitofp i32 %load to double + + %call.i8 = call spir_func i32 @_Z37__sycl_getScalar2020SpecConstantValueIiET_PKcPKvS4_(ptr addrspace(4) %3, ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL9test_id_1 to ptr addrspace(4)), ptr addrspace(4) %4) + %conv = sitofp i32 %call.i8 to double + ret void +} + +declare spir_func i32 @_Z37__sycl_getScalar2020SpecConstantValueIiET_PKcPKvS4_(ptr addrspace(4), ptr addrspace(4), ptr addrspace(4)) diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp index b1f54ee21b78d..3637930d72f8f 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #define DEBUG_TYPE "sycl-spec-const-materializer" @@ -298,9 +299,8 @@ PreservedAnalyses SYCLSpecConstMaterializer::run(Function &F, // Invariant: This pass is only intended to operate on SYCL kernels being // compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa` // triples. - auto AT = TargetHelpers::getArchType(*Mod); - if (TargetHelpers::ArchType::Cuda != AT && - TargetHelpers::ArchType::AMDHSA != AT) { + Triple T(Mod->getTargetTriple()); + if (!T.isNVPTX() && !T.isAMDGCN()) { LLVM_DEBUG(dbgs() << "Unsupported architecture\n"); return PreservedAnalyses::all(); } diff --git a/sycl-fusion/test/lit.cfg.py b/sycl-fusion/test/lit.cfg.py index d92326b020ce3..fffa59585ef0e 100644 --- a/sycl-fusion/test/lit.cfg.py +++ b/sycl-fusion/test/lit.cfg.py @@ -25,6 +25,6 @@ config.substitutions.append(("%shlibdir", config.llvm_shlib_dir)) if "NVPTX" in config.llvm_targets_to_build: - config.available_features.add('cuda') + config.available_features.add("cuda") if "AMDGPU" in config.llvm_targets_to_build: - config.available_features.add('hip_amd') + config.available_features.add("hip_amd") diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 35a9142059418..a5a70600e9002 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -30,6 +30,7 @@ option(SYCL_UMF_DISABLE_HWLOC set(UR_BUILD_EXAMPLES OFF CACHE BOOL "Build example applications." FORCE) set(UR_BUILD_TESTS OFF CACHE BOOL "Build unit tests." 
FORCE) set(UR_BUILD_XPTI_LIBS OFF) +set(UR_ENABLE_SYMBOLIZER ON CACHE BOOL "Enable symbolizer for sanitizer layer.") set(UR_ENABLE_TRACING ON) if("level_zero" IN_LIST SYCL_ENABLE_PLUGINS) @@ -115,14 +116,14 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit a985a81dc9ba8adfcc8b54e35ad287e97766fb3e - # Merge: b7b0c8b3 f772f907 - # Author: Piotr Balcer - # Date: Mon Jul 29 09:11:29 2024 +0200 - # Merge pull request #1905 from igchor/umf_hwloc_disable - # Bump UMF version to allow disabling hwloc - set(UNIFIED_RUNTIME_TAG a985a81dc9ba8adfcc8b54e35ad287e97766fb3e) +set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + # commit 3e762e00bcf13d158fb58e8e8c2eabcfc8934b4e + # Merge: c805a71a a2a053de + # Author: Omar Ahmed + # Date: Wed Jul 31 12:26:34 2024 +0100 + # Merge pull request #1884 from callumfare/callum/fix_printtrace + # Enable PrintTrace when SYCL UR tracing is enabled + set(UNIFIED_RUNTIME_TAG 3e762e00bcf13d158fb58e8e8c2eabcfc8934b4e) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") # Due to the use of dependentloadflag and no installer for UMF and hwloc we need diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc index 866ac03436ab3..1a7a9062885db 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc @@ -1591,9 +1591,9 @@ The device aspect descriptors for these queries are: [frame="none",options="header"] |====================== |Device descriptor |Description -|`aspect::ext_oneapi_interop_memory_import` | Indicates if the device supports +|`aspect::ext_oneapi_external_memory_import` | Indicates if the device supports importing external memory resources. -|`aspect::ext_oneapi_interop_semaphore_import`` | Indicates if the device +|`aspect::ext_oneapi_external_semaphore_import`` | Indicates if the device supports importing external semaphore resources. |====================== @@ -1687,35 +1687,35 @@ resource type. ```cpp namespace sycl::ext::oneapi::experimental { -struct interop_mem_handle { +struct external_mem { using raw_handle_type = /* Implementation defined */; raw_handle_type raw_handle; }; template -interop_mem_handle import_external_memory( +external_mem import_external_memory( external_mem_descriptor externalMemDescriptor, const sycl::device &syclDevice, const sycl::context &syclContext); template -interop_mem_handle import_external_memory( +external_mem import_external_memory( external_mem_descriptor externalMemDescriptor, const sycl::queue &syclQueue); image_mem_handle map_external_image_memory( - interop_mem_handle interopMemHandle, + external_mem externalMemHandle, const image_descriptor &imageDescriptor, const sycl::device &syclDevice, const sycl::context &syclContext); image_mem_handle map_external_image_memory( - interop_mem_handle interopMemHandle, + external_mem externalMemHandle, const image_descriptor &imageDescriptor, const sycl::queue &syclQueue); } ``` -The resulting `interop_mem_handle` can then be mapped, where the resulting type +The resulting `external_mem` can then be mapped, where the resulting type is an `image_mem_handle`. This can be used to construct images in the same way as memory allocated through `alloc_image_mem`. 
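As an illustrative sketch (not part of the normative wording above), the import-and-map flow might look as follows, assuming `extMemDescriptor` is an `external_mem_descriptor` populated from an exported resource, and `imageDescriptor` and `syclQueue` are placeholder names:

```cpp
namespace syclexp = sycl::ext::oneapi::experimental;

// Import the external memory resource, then map it to image memory that can
// back an image in the same way as memory obtained from `alloc_image_mem`.
syclexp::external_mem extMem =
    syclexp::import_external_memory(extMemDescriptor, syclQueue);

syclexp::image_mem_handle mappedMem =
    syclexp::map_external_image_memory(extMem, imageDescriptor, syclQueue);
```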
The `ext_oneapi_copy` operations also work with imported memory mapped to `image_mem_handle` types. @@ -1734,16 +1734,16 @@ behaviour. Once a user has finished operating on imported memory, they must ensure that they destroy the imported memory handle through `release_external_memory`. -`release_external_memory` can only accept `interop_mem_handles` that were +`release_external_memory` can only accept `external_mem` objects that were created through `import_external_memory`. ```cpp namespace sycl::ext::oneapi::experimental { -void release_external_memory(interop_mem_handle interopMem, +void release_external_memory(external_mem externalMem, const sycl::device &syclDevice, const sycl::context &syclContext); -void release_external_memory(interop_mem_handle interopMem, +void release_external_memory(external_mem externalMem, const sycl::queue &syclQueue); } ``` @@ -1807,27 +1807,27 @@ compatible with the `resource_fd` resource type. ```cpp namespace sycl::ext::oneapi::experimental { -struct interop_semaphore_handle { +struct external_semaphore { using raw_handle_type = /* Implementation defined */; raw_handle_type raw_handle; }; template -interop_semaphore_handle import_external_semaphore( +external_semaphore import_external_semaphore( external_semaphore_descriptor externalSemaphoreDescriptor, const sycl::device &syclDevice, const sycl::context &syclContext); template -interop_semaphore_handle import_external_semaphore( +external_semaphore import_external_semaphore( external_semaphore_descriptor externalSemaphoreDescriptor, const sycl::queue &syclQueue); } ``` -The resulting `interop_semaphore_handle` can then be used in a SYCL command +The resulting `external_semaphore` can then be used in a SYCL command group, to either wait until the semaphore signalled, or signal the semaphore. 
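For illustration only, a minimal sketch of the semaphore import, assuming `extSemDescriptor` is an `external_semaphore_descriptor` built from an exported handle and `syclQueue` is a placeholder queue:

```cpp
namespace syclexp = sycl::ext::oneapi::experimental;

// Import the external semaphore; it can subsequently be waited on or
// signalled through the handler/queue APIs listed below.
syclexp::external_semaphore extSem =
    syclexp::import_external_semaphore(extSemDescriptor, syclQueue);

syclQueue.ext_oneapi_wait_external_semaphore(extSem);
```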
If the type of semaphore imported supports setting the state of discrete @@ -1843,77 +1843,77 @@ namespace sycl { class handler { public: void ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle); + ext::oneapi::experimental::external_semaphore + external_semaphore); void ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, uint64_t wait_value); void ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle); + ext::oneapi::experimental::external_semaphore + external_semaphore); void ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, uint64_t signal_value); }; class queue { public: event ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle); + ext::oneapi::experimental::external_semaphore + external_semaphore); event ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, event DepEvent); event ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, const std::vector &DepEvents); event ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, uint64_t wait_value); event ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, uint64_t wait_value, event DepEvent); event ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, uint64_t wait_value, const std::vector &DepEvents); event ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle); + ext::oneapi::experimental::external_semaphore + external_semaphore); event ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, event DepEvent); event ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, const std::vector &DepEvents); event ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, uint64_t signal_value); event ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + ext::oneapi::experimental::external_semaphore + external_semaphore, uint64_t signal_value, event DepEvent); event ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle - interop_semaphore_handle, + 
ext::oneapi::experimental::external_semaphore + external_semaphore, uint64_t signal_value, const std::vector &DepEvents); }; @@ -1952,11 +1952,11 @@ access the external semaphore once they are no longer required through ```cpp namespace sycl::ext::oneapi::experimental { -void release_external_semaphore(interop_semaphore_handle semaphoreHandle, +void release_external_semaphore(external_semaphore semaphoreHandle, const sycl::device &syclDevice, const sycl::context &syclContext); -void release_external_semaphore(interop_semaphore_handle semaphoreHandle, +void release_external_semaphore(external_semaphore semaphoreHandle, const sycl::queue &syclQueue); } @@ -2547,34 +2547,34 @@ sycl::ext::oneapi::experimental::external_semaphore_descriptor< try { // Extension: import external semaphores - sycl::ext::oneapi::experimental::interop_semaphore_handle - wait_interop_semaphore_handle = + sycl::ext::oneapi::experimental::external_semaphore + wait_external_semaphore = sycl::ext::oneapi::experimental::import_external_semaphore( wait_external_semaphore_desc, queue); - sycl::ext::oneapi::experimental::interop_semaphore_handle - done_interop_semaphore_handle = + sycl::ext::oneapi::experimental::external_semaphore + done_external_semaphore = sycl::ext::oneapi::experimental::import_external_semaphore( done_external_semaphore_desc, queue); // Extension: import external memory from descriptors - sycl::ext::oneapi::experimental::interop_mem_handle - input_interop_mem_handle = + sycl::ext::oneapi::experimental::external_mem + input_external_mem = sycl::ext::oneapi::experimental::import_external_memory( input_ext_mem_desc, queue); - sycl::ext::oneapi::experimental::interop_mem_handle - output_interop_mem_handle = + sycl::ext::oneapi::experimental::external_mem + output_external_mem = sycl::ext::oneapi::experimental::import_external_memory( output_ext_mem_desc, queue); // Extension: map imported external memory to image memory sycl::ext::oneapi::experimental::image_mem_handle input_mapped_mem_handle = sycl::ext::oneapi::experimental::map_external_image_memory( - input_interop_mem_handle, desc, queue); + input_external_mem, desc, queue); sycl::ext::oneapi::experimental::image_mem_handle output_mapped_mem_handle = sycl::ext::oneapi::experimental::map_external_image_memory( - output_interop_mem_handle, desc, queue); + output_external_mem, desc, queue); // Extension: create images from mapped memory and return the handles sycl::ext::oneapi::experimental::unsampled_image_handle img_input = @@ -2585,7 +2585,7 @@ try { output_mapped_mem_handle, desc, queue); // Extension: wait for imported semaphore - q.ext_oneapi_wait_external_semaphore(wait_interop_semaphore_handle) + q.ext_oneapi_wait_external_semaphore(wait_external_semaphore) // Submit our kernel that depends on imported "wait_semaphore_file_descriptor" q.submit([&](sycl::handler &cgh) { @@ -2610,7 +2610,7 @@ try { }); // Extension: signal imported semaphore - q.ext_oneapi_signal_external_semaphore(done_interop_semaphore_handle) + q.ext_oneapi_signal_external_semaphore(done_external_semaphore) // The external API can now use the semaphore it exported to // "done_semaphore_file_descriptor" to schedule its own command submissions @@ -2619,13 +2619,13 @@ try { // Extension: destroy all external resources sycl::ext::oneapi::experimental::release_external_memory( - input_interop_mem_handle, queue); + input_external_mem, queue); sycl::ext::oneapi::experimental::release_external_memory( - output_interop_mem_handle, queue); + output_external_mem, queue); 
sycl::ext::oneapi::experimental::release_external_semaphore( - wait_interop_semaphore_handle, queue); + wait_external_semaphore, queue); sycl::ext::oneapi::experimental::release_external_semaphore( - done_interop_semaphore_handle, queue); + done_external_semaphore, queue); sycl::ext::oneapi::experimental::destroy_image_handle(img_input, queue); sycl::ext::oneapi::experimental::destroy_image_handle(img_output, queue); } catch (sycl::exception e) { @@ -2859,4 +2859,11 @@ These features still need to be handled: 3D USM images as they are not supported on any platform. - Refine the description of `ext_oneapi_bindless_images` aspect to indicate support for bindless image APIs. +|5.16|2024-07-24| - Renamed interop aspect queries, handles, semaphore wait and + signal functions, by replacing `interop` with `external` for + consistency with other interop related structs/funcs and + 3rd party interop API naming. + - Removed `handle` keyword from `interop_xxx_handle` to + clear up possible confusion between 3rd party interop + handles and the imported `interop_xxx_handle`. |====================== diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index a39c1d1c1884b..8e37c76ecc16a 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -556,6 +556,114 @@ Parameters: |=== +==== Dynamic Command Groups + +[source,c++] +---- +namespace ext::oneapi::experimental { +class dynamic_command_group { +public: + dynamic_command_group( + command_graph graph, + const std::vector>& cgfList); + + size_t get_active_cgf(); + void set_active_cgf(size_t cgfIndex); +}; +---- + +Dynamic command-groups can be added as nodes to a graph. They provide a mechanism that +allows updating the command-group function of a node after the graph is finalized. +There is always one command-group function in the dynamic command-group that is set +as active. When a dynamic command-group node is executed, the kernel of the active +command-group function will be run and all the other command-group functions in +`cgfList` will be ignored. + +See <> for more information +about updating command-groups. + +===== Limitations + +Dynamic command-groups can only be used to update kernels. Trying to update a command-group +function that contains other operations will result in an error. + +All the command-group functions in a dynamic command-group must have identical dependencies. +It is not allowed for a dynamic command-group to have command-group functions that would +result in a change to the graph topology when set to active. In practice, this means that +any calls to `handler.depends_on()` must be identical for all the command-group functions +in a dynamic command-group. + +Table {counter: tableNumber}. Member functions of the `dynamic_command_group` class. +[cols="2a,a"] +|=== +|Member Function|Description + +| +[source,c++] +---- +dynamic_command_group( +command_graph graph, +const std::vector>& cgfList); +---- + +|Constructs a dynamic command-group object that can be added as a node to a `command_graph`. + +Parameters: + +* `graph` - Graph to be associated with this `dynamic_command_group`. +* `cgfList` - The list of command-group functions that can be activated for this dynamic command-group. + The command-group function at index 0 will be active by default. 
+ +Exceptions: + +* Throws synchronously with error code `invalid` if the graph wasn't created with + the `property::graph::assume_buffer_outlives_graph` property and the `dynamic_command_group` + is created with command-group functions that use buffers. See the + <> + property for more information. + +* Throws with error code `invalid` if the `dynamic_command_group` is created with + command-group functions that are not kernel executions. + +* Throws with error code `invalid` if the command-group functions in `cgfList` have + event dependencies that are incompatible with each other and would result in + different graph topologies when set to active. + +| +[source,c++] +---- +size_t get_active_cgf(); +---- +|Returns the index of the currently active command-group function in this +`dynamic_command_group`. + +| +[source,c++] +---- +void set_active_cgf(size_t cgfIndex); +---- +| Sets the command-group function with index `cgfIndex` as active. The index of the +command-group function in a `dynamic_command_group` is identical to its index in the +`cgfList` vector when it was passed to the `dynamic_command_group` constructor. + +This change will be reflected immediately in the modifiable graph which contains this +`dynamic_command_group`. The new value will not be reflected in any executable graphs +created from that modifiable graph until `command_graph::update()` is called, passing +the modified nodes, or a new executable graph is finalized from the modifiable graph. + +Setting `cgfIndex` to the index of the currently active command-group function is +a no-op. + +Parameters: + +* `cgfIndex` - The index of the command-group function that should be set as active. + +Exceptions: + +* Throw with error code `invalid` if `cgfIndex` is not a valid index. + +|=== + ==== Depends-On Property [source,c++] @@ -631,6 +739,8 @@ public: template node add(T cgf, const property_list& propList = {}); + node add(dynamic_command_group& dynamicCG, const property_list& propList = {}); + void make_edge(node& src, node& dest); void print_graph(std::string path, bool verbose = false) const; @@ -711,21 +821,39 @@ Updates to a graph will be scheduled after any in-flight executions of the same graph and will not affect previous submissions of the same graph. The user is not required to wait on any previous submissions of a graph before updating it. -The only type of nodes that are currently able to be updated in a graph are -kernel execution nodes. - -The aspects of a kernel execution node that can be configured during update are: - -* Parameters to the kernel. -* Execution ND-Range of the kernel. - To update an executable graph, the `property::graph::updatable` property must have been set when the graph was created during finalization. Otherwise, an exception will be thrown if a user tries to update an executable graph. This guarantee allows the backend to provide a more optimized implementation, if possible. -===== Individual Node Update +===== Supported Features + +The only types of nodes that are currently able to be updated in a graph are +kernel execution nodes. + +There are two different API's that can be used to update a graph: + +* <> which allows updating +individual nodes of a command-graph. +* <> which allows updating the +entirety of the graph simultaneously by using another graph as a +reference. 
+ +The aspects of a kernel execution node that can be changed during update are +different depending on the API used to perform the update: + +* For the <> API it's possible to update +the kernel function, the parameters to the kernel, and the ND-Range. +* For the <> API, only the parameters of the kernel +and the ND-Range can be updated. + +===== Individual Node Update [[individual-node-update]] + +Individual nodes of an executable graph can be updated directly. Depending on the attribute +of the node that requires updating, different API's should be used: + +====== Parameter Updates Parameters to individual nodes in a graph in the `executable` state can be updated between graph executions using dynamic parameters. A `dynamic_parameter` @@ -739,14 +867,6 @@ Parameter updates are performed using a `dynamic_parameter` instance by calling not registered, even if they use the same parameter value as a `dynamic_parameter`. -The other node configuration that can be updated is the execution range of the -kernel, this can be set through `node::update_nd_range()` or -`node::update_range()` but does not require any prior registration. - -The executable graph can then be updated by passing the updated nodes to -`command_graph::update(node& node)` or -`command_graph::update(const std::vector& nodes)`. - Since the structure of the graph became fixed when finalizing, updating parameters on a node will not change the already defined dependencies between nodes. This is important to note when updating buffer parameters to a node, @@ -762,6 +882,41 @@ dynamic parameter for the buffer can be registered with all the nodes which use the buffer as a parameter. Then a single `dynamic_parameter::update()` call will maintain the graphs data dependencies. +====== Execution Range Updates + +Another configuration that can be updated is the execution range of the +kernel, this can be set through `node::update_nd_range()` or +`node::update_range()` but does not require any prior registration. + +An alternative way to update the execution range of a node is to do so while +updating command groups as described in the next section. + +====== Command Group Updates + +The command-groups of a kernel node can be updated using dynamic command-groups. +Dynamic command-groups allow replacing the command-group function of a kernel +node with a different one. This effectively allows updating the kernel function +and/or the kernel execution range. + +Command-group updates are performed by creating an instance of the +`dynamic_command_group` class. A dynamic command-group is created with a modifiable +state graph and a list of possible command-group functions. Command-group functions +within a dynamic command-group can then be set to active by using the member function +`dynamic_command_group::set_active_cgf()`. + +Dynamic command-groups are compatible with dynamic parameters. This means that +dynamic parameters can be used in command-group functions that are part of +dynamic command-groups. Updates to such dynamic parameters will be reflected +in the command-group functions once they are activated. + +====== Committing Updates + +Updating a node using the methods mentioned above will take effect immediately +for nodes in modifiable command-graphs. However, for graphs that are in the executable +state, in order to commit the update, the updated nodes must be passed to +`command_graph::update(node& node)` or +`command_graph::update(const std::vector& nodes)`. 
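For illustration, a hedged sketch of committing an update to an executable graph, assuming `Node` is an existing kernel node and `ExecGraph` was finalized with the `property::graph::updatable` property (both names are placeholders):

[source,c++]
----
// Change the execution range on the node...
Node.update_nd_range(sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{32}});

// ...then commit the change so it is reflected in the executable graph.
ExecGraph.update(Node);
----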
+ ===== Whole Graph Update [[whole-graph-update]] A graph in the executable state can have all of its nodes updated using the @@ -1042,6 +1197,42 @@ Exceptions: | [source,c++] ---- +node add(dynamic_command_group& dynamicCG, const property_list& propList = {}); +---- + +| Adds the dynamic command-group `dynamicCG` as a node to the graph and sets the +current active command-group function in `dynamicCG` as the executable for future +executions of this graph node. + +The current active command-group function in `dynamicCG` will be executed asynchronously +when the graph is submitted to a queue. The requisites of this command-group +function will be used to identify any dependent nodes in the graph +to form edges with. The other command-group functions in `dynamicCG` will be captured +into the graph but will not be executed in a graph submission unless they are +set to active. + +Constraints: + +* This member function is only available when the `command_graph` state is + `graph_state::modifiable`. + +Parameters: + +* `dynamicCG` - Dynamic command-group object to be added as a node. + +* `propList` - Zero or more properties can be provided to the constructed node + via an instance of `property_list`. The `property::node::depends_on` property + can be passed here with a list of nodes to create dependency edges on. + +Returns: The dynamic command-group object node which has been added to the graph. + +Exceptions: + +* Throws synchronously with error code `invalid` if a queue is recording + commands to the graph. +| +[source,c++] +---- void make_edge(node& src, node& dest); ---- @@ -1157,8 +1348,9 @@ void update(node& node); ---- | Updates an executable graph node that corresponds to `node`. `node` must be a -kernel execution node. Kernel arguments and the ND-range of the node will be -updated inside the executable graph to reflect the current values in `node`. +kernel execution node. The command-group function of the node will be updated, +inside the executable graph, to reflect the current values in `node`. This +includes the kernel function, the kernel nd-range and the kernel parameters. Updating these values will not change the structure of the graph. @@ -1190,9 +1382,10 @@ void update(const std::vector& nodes); ---- | Updates all executable graph nodes that corresponds to the nodes contained in -`nodes`. All nodes must be kernel nodes. Kernel arguments and the ND-range of -each node will be updated inside the executable graph to reflect the current -values in each node in `nodes`. +`nodes`. All nodes must be kernel nodes. The command-group function of each node +will be updated, inside the executable graph, to reflect the current values in +`nodes`. This includes the kernel function, the kernel nd-range and the kernel +parameters". Updating these values will not change the structure of the graph. @@ -1712,6 +1905,10 @@ the call to `queue::submit()` or `command_graph::add()` along with the calls to handler functions and this will not be reflected on future executions of the graph. +Similarly, any command-group function inside a `dynamic_command_group` will be +evaluated once, in index order, when submitted to the graph using +`command_graph::add()`. + Any code like this should be moved to a separate host-task and added to the graph via the recording or explicit APIs in order to be compatible with this extension. 
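As a non-normative sketch of the recommendation above, host-side work can be captured in its own host-task node (explicit API shown, with `myGraph` as in the later examples) so that it is re-executed on every graph submission rather than being evaluated once at `add()` time:

[source,c++]
----
node hostNode = myGraph.add([&](sycl::handler &cgh) {
  cgh.host_task([=]() {
    // Host-side work that must run on every execution of the graph.
  });
});
----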
@@ -2243,6 +2440,50 @@ node nodeA = myGraph.add([&](handler& cgh) { dynParamAccessor.update(bufferB.get_access()); ---- +=== Dynamic Command Groups + +Example showing how a graph with a dynamic command group node can be updated. + +[source,c++] +---- +queue Queue{}; +exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + +int *PtrA = malloc_device(1024, Queue); +int *PtrB = malloc_device(1024, Queue)​ + +auto CgfA = [&](handler &cgh) { + cgh.parallel_for(1024, [=](item<1> Item) { + PtrA[Item.get_id()] = 1;​ + }); +}; + +auto CgfB = [&](handler &cgh) { + cgh.parallel_for(512, [=](item<1> Item) { + PtrB[Item.get_id()] = 2; + }); +}; + +// Construct a dynamic command-group with CgfA as the active cgf (index 0). +auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CgfA, CgfB}); + +// Create a dynamic command-group graph node. +auto DynamicCGNode = Graph.add(DynamicCG); + +auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + +// The graph will execute CgfA. +Queue.ext_oneapi_graph(ExecGraph).wait(); + +// Sets CgfB as active in the dynamic command-group (index 1). +DynamicCG.set_active_cgf(1); + +// Calls update to update the executable graph node with the changes to DynamicCG. +ExecGraph.update(DynamicCGNode); + +// The graph will execute CgfB. +Queue.ext_oneapi_graph(ExecGraph).wait(); +---- === Whole Graph Update Example that shows recording and updating several nodes with different @@ -2444,6 +2685,16 @@ to ensure this is desired and makes sense to users. **UNRESOLVED** Needs more discussion. +=== Updatable command-groups in the Record & Replay API: + +Currently the only way to update command-groups in a graph is to use the +Explicit API. There is a limitation in some backends that requires all +the command-groups used for updating to be specified before the graph +is finalized. This restriction makes it hard to implement the +Record & Replay API in a performant manner. + +**UNRESOLVED** Needs more discussion. + === Multi Device Graph Allow an executable graph to contain nodes targeting different devices. diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_prefetch.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_prefetch.asciidoc similarity index 98% rename from sycl/doc/extensions/proposed/sycl_ext_oneapi_prefetch.asciidoc rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_prefetch.asciidoc index 4a035028ae6a1..d70e4c3bfbe26 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_prefetch.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_prefetch.asciidoc @@ -48,12 +48,11 @@ This extension also depends on the following other SYCL extensions: == Status -This is a proposed extension specification, intended to gather community -feedback. Interfaces defined in this specification may not be implemented yet -or may be in a preliminary state. The specification itself may also change in -incompatible ways before it is finalized. *Shipping software products should -not rely on APIs defined in this specification.* - +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in DPC++, but they are not finalized and may +change incompatibly in future versions of DPC++ without prior notice. 
*Shipping +software products should not rely on APIs defined in this specification.* == Overview diff --git a/sycl/doc/extensions/proposed/sycl_ext_codeplay_cuda_cluster_group.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_codeplay_cuda_cluster_group.asciidoc new file mode 100644 index 0000000000000..840e057f9d0b1 --- /dev/null +++ b/sycl/doc/extensions/proposed/sycl_ext_codeplay_cuda_cluster_group.asciidoc @@ -0,0 +1,420 @@ += sycl_ext_codeplay_cuda_cluster_group + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] +:endnote: —{nbsp}end{nbsp}note + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] + +Copyright (C) 2024-2024 Codeplay Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of +The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + +Other company and product names may be trademarks of the respective companies +with which they are associated and can be claimed as the property of others. + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Contributors +Atharva Dubey, Codeplay + +Gordon Brown, Codeplay + +== Dependencies + +This extension is written against the SYCL 2020 revision 8 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + +This extensions also depends on the following other sycl extensions: + +* link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[ + sycl_ext_oneapi_enqueue_functions] +* link:../experimental/sycl/sycl_ext_oneapi_properties.asciidoc[ + sycl_ext_oneapi_properties +] + + +== Status + +This is a proposed extension specification, intended to gather community +feedback. Interfaces defined in this specification may not be implemented yet +or may be in a preliminary state. The specification itself may also change in +incompatible ways before it is finalized. *Shipping software products should +not rely on APIs defined in this specification.* + + +== Glossary + +* Compute Capability: Abbreviated as "cc", a number assigned to each generation +of NVIDIA's GPUs conveying the feature set associated with that number. + + + +== Overview + +CUDA compute capability (cc) 9.0 (sm_90 and above) devices introduces a new level in the +thread hierarchy, called as thread block clusters, in CUDA terminology. A thread +block cluster, is a collection of thread blocks (a work-group in SYCL +terminology) that run concurrently. The work-groups which make up a cluster +have the ability to access one another's local memory, and can be synchronized. +This has various applications, convolutions, GEMMs and FFTs to name a few. + +This proposal introduces a SYCL API to expose these capabilities, and defines a +mechanism to launch a kernel with clusters enabled, access the cluster's various +ranges and id's from the device code, atomics at the cluster level as well as +synchronize the cluster. This proposal also introduces a device aspect to check +if the SYCL device supports a cluster launch, and a device query to obtain the +maximum supported cluster size. 
+ + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_CODEPLAY_CUDA_CLUSTER_GROUP` to one of the values defined in the +table below. Applications can test for the existence of this macro to determine +if the implementation supports this feature, or applications can test the +macro's value to determine which of the extension's features the implementation +supports + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not version-ed, so the + feature-test macro always has this value. +|=== + + +=== Extension to `enum class aspect` + +[source] +---- +namespace sycl { +enum class aspect { + ... + ext_codeplay_cuda_cluster_group +} +} +---- + +A device requires the `ext_codeplay_cuda_cluster_group` aspect to +support launching a kernel with the `cluster_size` property defined in the +following section. + + +=== Launching a kernel with a `cluster_group` + +Because of the special scheduling guarantees associated with a cluster launch, +the backend must know which kernel would be using this feature. Thus, this +proposal introduces a new launch property called as `cluster_size` that will +contain the cluster size as a number of work-groups. + +[source,c++] +---- +namespace sycl::ext::codeplay::experimental::cuda { +/** +* Dim Dimensionality of the launch +* size sycl::range specifying the number of work-groups in the cluster + in each dimension. +*/ +template +struct cluster_size { + cluster_size(const sycl::range& size); + sycl::range get_cluster_size(); + ... +}; +using cluster_size_key = cluster_size; +} // namespace sycl::ext::codeplay::experimental::cuda +---- + +The property list can the be constructed as follows - + +[source,c++] +---- +properties cluster_launch_property{cluster_size({1, 2, 1})}; +---- + +[_Note:_ the total number of work-groups in the kernel must be a multiple of +the cluster size in each dimension. _{endnote}_] + +The launch functions introduced in `sycl_ext_oneapi_enqueue_functions` can then +be used to launch the kernel with the `cluster_size` property. + + +=== Querying Maximum Cluster Size + +To query the maximum supported cluster size, this proposal adds a new device +query, `max_cluster_group_size`, which returns the maximum possible number of +work-groups present inside the cluster. + +[source, c++] +---- +size_t max_cluster_size = + device.get_info< + ext::codeplay::experimental::cuda::info::device::max_cluster_group_size>(); +---- +[%header,cols="10,5,5"] +|=== +|Device descriptor +|Return Type +|Description + +|`ext::codeplay::experimental::cuda::info::device::max_cluster_group_size` +|size_t +|Returns the maximum possible number of work-groups that can constitute a +cluster-group +|=== + + +=== Accessing the Cluster Group From Device Code + +Building upon the group hierarchy in SYCL, this proposal adds another level +above group (for work-groups), to be called as `cluster-group`, which +represents a collection of work-groups and will be accessible via the `nd_item` +class, via a member function to be introduced called `ext_codeplay_cuda_get_cluster_group()`. 
+ + +[%header,cols="10,5"] +|=== +|Method +|Description + +|`cluster_group nd_item::ext_codeplay_cuda_get_cluster_group()` +|Returns the constituent `cluster_group` in the kernel, representing this +`cluster_group` object's overall position in the `nd_range` +|=== + + +The `cluster_group` class will contain the following member functions, to access +the various ids of the work-item and work-groups. + +[source,c++] +---- + template + class cluster_group { + public: + using id_type = id; + using range_type = range; + using linear_id_type = size_t; + + linear_id_type get_group_linear_id() const; + + linear_id_type get_local_linear_id() const; + + range_type get_group_range() const; + + id_type get_group_id() const; + + id_type get_local_id() const; + + range_type get_local_range() const; + + linear_id_type get_local_linear_range() const; + + linear_id_type get_group_linear_range() const; + + bool leader() const; + + static constexpr memory_scope fence_scope = + memory_scope::ext_codeplay_cuda_cluster_group; + } +---- + + +[%header,cols="5,5"] +|=== +|Method +|Description + +|`linear_id get_group_linear_id() const` +|Returns the linearized id of the calling work-group within the cluster. + +|`linear_id get_local_linear_id() const` +|Returns the linearized index of the calling work-item within the cluster. + +|`range_type get_group_range() const` +|Returns the number of work-groups in each dimension within the cluster. + +|`id_type get_group_id() const` +|Returns the id of the calling work-group along each dimension within the cluster. + +|`id_type get_local_id() const`; +|Returns the id of calling work-item along each dimension within the cluster. + +|`range_type get_local_range() const`; +|Returns the number of work-items along each dimension within the cluster. + +|`linear_id_type get_local_linear_range() const`; +|Returns a linearized version of the `range_type` returned by `get_local_range` + +|`linear_id_type get_group_linear_range() const`; +|Returns a linearized version of the `range_type` returned by `get_group_range` + +|`bool leader() const`; +|Returns true for exactly one work-item in the cluster, if the calling work-item +is the leader of the cluster group. The leader is guaranteed to be the work-item +for which `get_local_linear_id` return 0. +|=== + + +== Accessing another work-group's local memory + +Work-groups within the cluster have the ability to access another work-group's +local memory. Typically addresses which reside in the local memory of a +work-group can only be accessed by the work-items of that work-group. +Therefore, to access another work-group's local memory, the address needs to be +mapped such that the address in another work-group is accessible within the +calling work-item. Further, to access another work-group's local memory, +all the work-groups within the cluster must exist and the work-groups should +not cease to exist before all the memory operations are completed. This can be +ensured by synchronizing all the work-items within the cluster before and after +the local memory operations, using `group_barrier`. + +A member function of the `cluster_group` class; +`map_cluster_local_pointer` will perform the mapping and return a pointer +which can then be dereferenced by the calling work-item. 
+ + +[%header,cols="10,5"] +|=== +|Method +|Description + +|T* map_cluster_local_pointer(T* addr, size_t group_id) +|Accepts the equivalent address to the memory location relative to the calling +work-item which is to be mapped from the local memory of the work-group, as +specified by `group_id`, denoting the linear group id within the cluster +|=== + +Conversely, `get_cluster_group_linear_id_for_local_pointer` will return the +linearized id of the work-group a mapped local memory address belongs to. + +[%header,cols="10,5"] +|=== +|Method +|Description + +|size_t get_cluster_group_linear_id_for_local_pointer(T* addr) +|Accepts a pointer pointing to a valid local memory space, and the returns the +linearized id of the work-group in the cluster that address belongs to. +|=== + + +== Cluster Memory Fence Scope and Barrier + +Work-items in a work-group can access a local memory address from another +work-group in the cluster-group, which has been mapped as described above. To +facilitate this, a new memory scope is introduced to the `memory_scope` class; +`ext_codeplay_cuda_cluster_group` which indicates a memory ordering +constraint that applies to all work-items in the same cluster-group. This memory +scope can be used with `atomic_ref` and other SYCL APIs that use +`memory_scope`. + +[source, c++] +---- +namespace sycl { + + enum class memory_scope { + ... + ext_codeplay_cuda_cluster_group, + ... + }; + + namespace ext::codeplay::experimental::cuda { + inline constexpr auto memory_scope_cluster_group + = memory_scope::ext_codeplay_cuda_cluster_group; + } // namespace ext::codeplay::experimental::cuda +} // namespace sycl +---- + + +To coordinate all work-items in the cluster group, `sycl::group_barrier` can be +used, accepting the `cluster_group` class. + + +== Example + +This section adds a representative example of how to launch a kernel with +the cluster-range specified and accessing various id's within the kernel - + +[source,c++] +---- +sycl::event launch_kernel_with_cluster() { + namespace syclcp = sycl::ext::codeplay::experimental; + namespace syclex = sycl::ext::oneapi::experimental; + + sycl::queue q; + + sycl::nd_range<3> ndRange({4096, 4096, 32}, {32, 32, 1}); + syclex::properties props(syclcp::cuda::cluster_size({4, 4, 1})); + syclex::launch_config config(ndRange, props); + + return syclex::submit_with_event(q, [&](sycl::handler& cgh){ + syclex::nd_launch(cgh, config, [=](sycl::nd_item<3> it) { + auto cg = it.ext_codeplay_cuda_get_cluster_group(); + auto cgId = cg.get_group_id(); + ... + }); + }) +} +---- + + +== Known Issues + +. Forward Progress Guarantees ++ +-- +*UNRESOLVED* This Specification does not discuss the forward progress guarantees of the + cluster_group. +-- + +. Differentiating between decorated and generic address spaces ++ +-- +*UNRESOLVED* The functions `map_cluster_local_pointer` and +`get_cluster_group_linear_id_for_local_pointer` do not differentiate between +generic and local memory address spaces, which might not be the most efficient. 
+-- + + +== Revision History + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|======================================== +|Rev|Date|Authors|Changes +|4|2024-06-26|Atharva Dubey, Jack Kirk|Added device query and aspects, + review comments and additional cluster group member functions +|2|2024-05-09|Atharva Dubey|Using enqueue functions to launch with properties +|1|2024-04-29|Atharva Dubey|Initial public working draft +|======================================== \ No newline at end of file diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_barrier.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_barrier.asciidoc index 2c3df707f0dfc..2e91bb675ac84 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_barrier.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_barrier.asciidoc @@ -129,16 +129,16 @@ namespace sycl::ext::oneapi::experimental { public: using arrival_token = __unspecified__; - static constexpr ptrdiff_t max() noexcept; + static constexpr std::ptrdiff_t max() noexcept; - constexpr explicit barrier(ptrdiff_t expected, + constexpr explicit barrier(std::ptrdiff_t expected, CompletionFunction f = CompletionFunction()); ~barrier(); barrier(const barrier&) = delete; barrier& operator=(const barrier&) = delete; - [[nodiscard]] arrival_token arrive(ptrdiff_t update = 1); + [[nodiscard]] arrival_token arrive(std::ptrdiff_t update = 1); void wait(arrival_token&& arrival) const; void arrive_and_wait(); @@ -150,14 +150,14 @@ namespace sycl::ext::oneapi::experimental { [source,c++] ---- -static constexpr ptrdiff_t max() noexcept; +static constexpr std::ptrdiff_t max() noexcept; ---- _Returns_: The maximum number of threads of execution that can be synchronized by any `barrier` with the specified `Scope` and `CompletionFunction`. [source,c++] ---- -constexpr explicit barrier(ptrdiff_t expected, CompletionFunction f = CompletionFunction()); +constexpr explicit barrier(std::ptrdiff_t expected, CompletionFunction f = CompletionFunction()); ---- _Preconditions_: If `Scope` is `memory_scope::work_group`, the calling thread of execution must be a work-item belonging to the work-group that will use the @@ -184,7 +184,7 @@ concurrently introduces a data race. [source,c++] ---- -[[nodiscard]] arrival_token arrive(ptrdiff_t update = 1); +[[nodiscard]] arrival_token arrive(std::ptrdiff_t update = 1); ---- _Effects_: The calling thread of execution arrives at the barrier and decreases the expected count by `update`. @@ -240,6 +240,7 @@ extension. [source,c++] ---- +namespace syclex = sycl::ext::oneapi::experimental; using work_group_barrier = syclex::barrier; q.parallel_for(..., [=](sycl::nd_item it) { @@ -264,6 +265,7 @@ initialized on the device that will use the barrier. [source,c++] ---- +namespace syclex = sycl::ext::oneapi::experimental; using device_barrier = syclex::barrier; // Allocate memory for the barrier @@ -306,6 +308,7 @@ accessible by the host. 
[source,c++] ---- +namespace syclex = sycl::ext::oneapi::experimental; using system_barrier = syclex::barrier; // Allocate memory for the barrier diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 127df2d17cac9..6dd8708afeb62 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -42,7 +42,14 @@ Specifically, this library depends on the following SYCL extensions: ../extensions/supported/sycl_ext_oneapi_assert.asciidoc) * [sycl_ext_oneapi_enqueue_barrier]( ../extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc) -* [sycl_ext_oneapi_usm_device_read_only](../extensions/supported/sycl_ext_oneapi_usm_device_read_only.asciidoc) +* [sycl_ext_oneapi_usm_device_read_only]( + ../extensions/supported/sycl_ext_oneapi_usm_device_read_only.asciidoc) +* [sycl_ext_oneapi_properties]( + ../extensions/experimental/sycl_ext_oneapi_properties.asciidoc) +* [sycl_ext_oneapi_enqueue_functions]( + ../extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc) +* [sycl_ext_oneapi_kernel_properties]( + ../extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc) If available, the following extensions extend SYCLcompat functionality: @@ -206,44 +213,6 @@ These translate any kernel dimensions from one convention to the other. An example of an equivalent SYCL call for a 3D kernel using `compat` is `syclcompat::global_id::x() == get_global_id(2)`. -### Local Memory - -When using `compat` functions, there are two distinct interfaces to allocate -device local memory. The first interface uses the _sycl_ext_oneapi_local_memory_ -extension to leverage local memory defined at compile time. -_sycl_ext_oneapi_local_memory_ is accessed through the following wrapper: - -``` c++ -namespace syclcompat { - -template auto *local_mem(); - -} // syclcompat -``` - -`syclcompat::local_mem()` can be used as illustrated in the example -below. - -```c++ -// Sample kernel -using namespace syclcompat; -template -void local_mem_2d(int *d_A) { - // Local memory extension wrapper, size defined at compile-time - auto As = local_mem(); - int id_x = local_id::x(); - int id_y = local_id::y(); - As[id_y][id_x] = id_x * BLOCK_SIZE + id_y; - wg_barrier(); - int val = As[BLOCK_SIZE - id_y - 1][BLOCK_SIZE - id_x - 1]; - d_A[global_id::y() * BLOCK_SIZE + global_id::x()] = val; -} -``` - -The second interface allows users to allocate device local memory at runtime. -SYCLcompat provides this functionality through its kernel launch interface, -`launch`, defined in the following section. - ### launch SYCLcompat provides a kernel `launch` interface which accepts a function that @@ -254,7 +223,7 @@ device _function_ with the use of an `auto F` template parameter, and a variadic `Args` for the function's arguments. Various overloads for `launch` exist to permit the user to launch on a -specific `queue`, or to define dynamically sized device local memory. +specific `queue`, or to describe the range as either `nd_range` or `dim3, dim3`. ``` c++ namespace syclcompat { @@ -273,22 +242,6 @@ template sycl::event launch(const dim3 &grid, const dim3 &threads, sycl::queue q, Args... args); -template -sycl::event launch(const sycl::nd_range &range, size_t mem_size, - sycl::queue q, Args... args); - -template -sycl::event launch(const sycl::nd_range &range, size_t mem_size, - Args... args); - -template -sycl::event launch(const dim3 &grid, const dim3 &threads, - size_t mem_size, sycl::queue q, Args... 
args); - -template -sycl::event launch(const dim3 &grid, const dim3 &threads, - size_t mem_size, Args... args); - } // syclcompat ``` @@ -313,67 +266,156 @@ auto range = sycl::nd_range<3>{blocksPerGrid * threadsPerBlock, syclcompat::launch(range, d_A, d_B, d_C, n); ``` -For dynamic local memory allocation, `launch` injects a pointer to a -local `char *` accessor of `mem_size` as the last argument of the kernel -function. For example, the previous function named `vectorAdd` can be modified -with the following signature, which adds a `char *` pointer to access local -memory inside the kernel: +Note that since `syclcompat::launch` accepts a device function, the kernel +lambda is constructed by SYCLcompat internally. This means that, for +example, `sycl::local_accessor`s cannot be declared. Instead, users wishing to +use local memory should launch with a `launch_policy` object as described below. -``` c++ -void vectorAdd(const float *A, const float *B, float *C, int n, - char *local_mem); +#### launch_policy + +In addition to the simple `syclcompat::launch` interface described above, +SYCLcompat provides a more flexible (`experimental`) interface to `launch` a +kernel with a given `launch_policy`. By constructing and passing a +`launch_policy`, users can pass `sycl::ext::oneapi::experimental::properties` +associated with the kernel or launch, as well as request **local memory** for +the kernel. + +In order to disambiguate the variadic constructor of `launch_policy`, the +following wrapper structs are defined. The `kernel_properties` and +`launch_properties` wrappers can be constructed *either* with a variadc set of +properties, or with an existing `sycl_exp::properties` object. + +```cpp +namespace syclcompat::experimental { +namespace sycl_exp = sycl::ext::oneapi::experimental; + +// Wrapper for kernel sycl_exp::properties +template struct kernel_properties { + using Props = Properties; + template + kernel_properties(Props... properties); + template + kernel_properties(sycl_exp::properties properties) + Properties props; +}; + +// Wrapper for launch sycl_exp::properties +template struct launch_properties { + using Props = Properties; + template + launch_properties(Props... properties); + template + launch_properties(sycl_exp::properties properties) + Properties props; +}; + +// Wrapper for local memory size +struct local_mem_size { + local_mem_size(size_t size = 0); + size_t size; +}; + +} //namespace syclcompat::experimental ``` -Then, `vectorAdd` can be launched like this: +The constructors of `launch_policy` are variadic, accepting any form of range +(`nd_range`, `range`, `dim3`, `dim3, dim3`), followed by zero or more of +`local_memory_size`, `kernel_properties`, and `launch_properties`: ``` c++ -syclcompat::launch(blocksPerGrid, threadsPerBlock, mem_size, d_A, - d_B, d_C, n); +namespace syclcompat::experimental { +namespace sycl_exp = sycl::ext::oneapi::experimental; + +// launch_policy is constructed by the user & passed to `compat_exp::launch` +template +class launch_policy { +public: + using KPropsT = KProps; + using LPropsT = LProps; + using RangeT = Range; + static constexpr bool HasLocalMem = LocalMem; + + template + launch_policy(Range range, Ts... ts); + + template + launch_policy(dim3 global_range, Ts... ts); + + template + launch_policy(dim3 global_range, dim3 local_range, Ts... 
ts); + + KProps get_kernel_properties(); + LProps get_launch_properties(); + size_t get_local_mem_size(); + Range get_range(); +}; +} //namespace syclcompat::experimental ``` -or this: +The `launch` overloads accepting a `launch_policy` are: + +```cpp +namespace syclcompat::experimental { + +template +sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args); + +template +sycl::event launch(LaunchPolicy launch_policy, Args... args); +} //namespace syclcompat::experimental -``` c++ -auto range = sycl::nd_range<3>{globalSize, localSize}; -syclcompat::launch(range, mem_size, d_A, d_B, d_C, n); ``` -This `launch` interface allows users to define an internal memory pool, or -scratchpad, that can then be reinterpreted as the datatype required by the user -within the kernel function. +For local memory, `launch` injects a `char *` pointer to the beginning +of a local accessor of the requested `local_mem_size` as the last argument of +the kernel function. This `char *` can then be reinterpreted as the datatype +required by the user within the kernel function. -To launch a kernel with a specified sub-group size, overloads similar to above -`launch` functions are present in the `syclcompat::experimental` namespace, -which accept SubgroupSize as a template parameter and can be called as -`launch` +For example, the previous function named `vectorAdd` can be modified +with the following signature, which adds a `char *` pointer to access local +memory inside the kernel: -```cpp +``` c++ +void vectorAdd(const float *A, const float *B, float *C, int n, + char *local_mem); +``` + +Then, the new `vectorAdd` can be launched like this: + +``` c++ +using syclcompat::experimental; +launch_policy policy{blocksPerGrid, threadsPerBlock, + local_mem_size(nbytes)}; +launch(policy, d_A, d_B, d_C, n); +``` -template -sycl::event launch(sycl::nd_range<3> launch_range, std::size_t local_memory_size, - sycl::queue queue, Args... args); +To request a different cache/local memory split on supported hardware: -template -sycl::event launch(sycl::nd_range launch_range, std::size_t local_memory_size, - Args... args); +```c++ +using syclcompat::experimental; +namespace sycl_intel_exp = sycl::ext::intel::experimental; -template -sycl::event launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim, - std::size_t local_memory_size, Args... args); +sycl_intel_exp::cache_config cache_config{ + sycl_intel_exp::large_slm}; +kernel_properties kernel_props{cache_config}; +launch_policy policy{blocksPerGrid, threadsPerBlock, + local_mem_size(nbytes), kernel_props}; +launch(policy, d_A, d_B, d_C, n); +``` -template -sycl::event launch(sycl::nd_range<3> launch_range, sycl::queue queue, - Args... args); +To request a certain cluster dimension on supported hardware: -template -sycl::event launch(sycl::nd_range launch_range, - Args... args); +```c++ +using syclcompat::experimental; +namespace sycl_exp = sycl::ext::oneapi::experimental; -template -sycl::event launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim, - Args... 
args); +sycl_exp::cuda::cluster_size cluster_dims(cluster_range); +launch_policy policy{blocksPerGrid, threadsPerBlock, + local_mem_size(nbytes), + launch_properties{cluster_dims}}; +launch(policy, d_A, d_B, d_C, n); ``` ### Utilities diff --git a/sycl/include/sycl/bit_cast.hpp b/sycl/include/sycl/bit_cast.hpp index 09148847eefe3..1f6573e54de6b 100644 --- a/sycl/include/sycl/bit_cast.hpp +++ b/sycl/include/sycl/bit_cast.hpp @@ -61,7 +61,7 @@ constexpr static_assert(std::is_trivially_default_constructible::value, "To must be trivially default constructible"); To to; - sycl::detail::memcpy(&to, &from, sizeof(To)); + sycl::detail::memcpy_no_adl(&to, &from, sizeof(To)); return to; #endif } diff --git a/sycl/include/sycl/detail/group_sort_impl.hpp b/sycl/include/sycl/detail/group_sort_impl.hpp index b33466b960c54..b3f72ddde09f6 100644 --- a/sycl/include/sycl/detail/group_sort_impl.hpp +++ b/sycl/include/sycl/detail/group_sort_impl.hpp @@ -495,28 +495,28 @@ struct ScratchMemory { operator T() const { T value{0}; - detail::memcpy(&value, MPtr, sizeof(T)); + detail::memcpy_no_adl(&value, MPtr, sizeof(T)); return value; } T operator++(int) noexcept { T value{0}; - detail::memcpy(&value, MPtr, sizeof(T)); + detail::memcpy_no_adl(&value, MPtr, sizeof(T)); T value_before = value++; - detail::memcpy(MPtr, &value, sizeof(T)); + detail::memcpy_no_adl(MPtr, &value, sizeof(T)); return value_before; } T operator++() noexcept { T value{0}; - detail::memcpy(&value, MPtr, sizeof(T)); + detail::memcpy_no_adl(&value, MPtr, sizeof(T)); ++value; - detail::memcpy(MPtr, &value, sizeof(T)); + detail::memcpy_no_adl(MPtr, &value, sizeof(T)); return value; } ReferenceObj &operator=(const T &value) noexcept { - detail::memcpy(MPtr, &value, sizeof(T)); + detail::memcpy_no_adl(MPtr, &value, sizeof(T)); return *this; } @@ -531,7 +531,7 @@ struct ScratchMemory { } void copy(const ReferenceObj &value) noexcept { - detail::memcpy(MPtr, value.MPtr, sizeof(T)); + detail::memcpy_no_adl(MPtr, value.MPtr, sizeof(T)); } private: diff --git a/sycl/include/sycl/detail/memcpy.hpp b/sycl/include/sycl/detail/memcpy.hpp index 9e2eac2b30b7c..b35d03a553385 100644 --- a/sycl/include/sycl/detail/memcpy.hpp +++ b/sycl/include/sycl/detail/memcpy.hpp @@ -13,7 +13,13 @@ namespace sycl { inline namespace _V1 { namespace detail { -inline void memcpy(void *Dst, const void *Src, size_t Size) { +// Using "memcpy_no_adl" function name instead of "memcpy" to prevent +// ambiguity with libc's memcpy. 
Even though they are in a different +// namespace, due to ADL, compiler may lookup "memcpy" symbol in the +// sycl::detail namespace, like in the following code: +// sycl::vec a, b; +// memcpy(&a, &b, sizeof(sycl::vec)); +inline void memcpy_no_adl(void *Dst, const void *Src, size_t Size) { #ifdef __SYCL_DEVICE_ONLY__ __builtin_memcpy(Dst, Src, Size); #else diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 76d4c81a93fe8..90436366a20ea 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -397,9 +397,9 @@ EnableIfGenericBroadcast GroupBroadcast(Group g, T x, IdT local_id) { char *ResultBytes = reinterpret_cast(&Result); auto BroadcastBytes = [=](size_t Offset, size_t Size) { uint64_t BroadcastX, BroadcastResult; - detail::memcpy(&BroadcastX, XBytes + Offset, Size); + detail::memcpy_no_adl(&BroadcastX, XBytes + Offset, Size); BroadcastResult = GroupBroadcast(g, BroadcastX, local_id); - detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size); + detail::memcpy_no_adl(ResultBytes + Offset, &BroadcastResult, Size); }; GenericCall(BroadcastBytes); return Result; @@ -449,9 +449,9 @@ EnableIfGenericBroadcast GroupBroadcast(Group g, T x, char *ResultBytes = reinterpret_cast(&Result); auto BroadcastBytes = [=](size_t Offset, size_t Size) { uint64_t BroadcastX, BroadcastResult; - detail::memcpy(&BroadcastX, XBytes + Offset, Size); + detail::memcpy_no_adl(&BroadcastX, XBytes + Offset, Size); BroadcastResult = GroupBroadcast(g, BroadcastX, local_id); - detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size); + detail::memcpy_no_adl(ResultBytes + Offset, &BroadcastResult, Size); }; GenericCall(BroadcastBytes); return Result; @@ -1104,9 +1104,9 @@ EnableIfGenericShuffle Shuffle(GroupT g, T x, id<1> local_id) { char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; - detail::memcpy(&ShuffleX, XBytes + Offset, Size); + detail::memcpy_no_adl(&ShuffleX, XBytes + Offset, Size); ShuffleResult = Shuffle(g, ShuffleX, local_id); - detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size); + detail::memcpy_no_adl(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); return Result; @@ -1119,9 +1119,9 @@ EnableIfGenericShuffle ShuffleXor(GroupT g, T x, id<1> local_id) { char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; - detail::memcpy(&ShuffleX, XBytes + Offset, Size); + detail::memcpy_no_adl(&ShuffleX, XBytes + Offset, Size); ShuffleResult = ShuffleXor(g, ShuffleX, local_id); - detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size); + detail::memcpy_no_adl(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); return Result; @@ -1134,9 +1134,9 @@ EnableIfGenericShuffle ShuffleDown(GroupT g, T x, uint32_t delta) { char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; - detail::memcpy(&ShuffleX, XBytes + Offset, Size); + detail::memcpy_no_adl(&ShuffleX, XBytes + Offset, Size); ShuffleResult = ShuffleDown(g, ShuffleX, delta); - detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size); + detail::memcpy_no_adl(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); return Result; @@ -1149,9 +1149,9 @@ EnableIfGenericShuffle ShuffleUp(GroupT g, T x, uint32_t delta) { char *ResultBytes = reinterpret_cast(&Result); 
auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; - detail::memcpy(&ShuffleX, XBytes + Offset, Size); + detail::memcpy_no_adl(&ShuffleX, XBytes + Offset, Size); ShuffleResult = ShuffleUp(g, ShuffleX, delta); - detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size); + detail::memcpy_no_adl(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); return Result; diff --git a/sycl/include/sycl/device_aspect_macros.hpp b/sycl/include/sycl/device_aspect_macros.hpp index 385afb05c3ce4..b3b614fae58ce 100644 --- a/sycl/include/sycl/device_aspect_macros.hpp +++ b/sycl/include/sycl/device_aspect_macros.hpp @@ -233,14 +233,14 @@ #define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_bindless_images_2d_usm__ 0 #endif -#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_interop_memory_import__ -//__SYCL_ASPECT(ext_oneapi_interop_memory_import, 46) -#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_interop_memory_import__ 0 +#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_external_memory_import__ +//__SYCL_ASPECT(ext_oneapi_external_memory_import, 46) +#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_external_memory_import__ 0 #endif -#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_interop_semaphore_import__ -//__SYCL_ASPECT(ext_oneapi_interop_semaphore_import, 48) -#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_interop_semaphore_import__ 0 +#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_external_semaphore_import__ +//__SYCL_ASPECT(ext_oneapi_external_semaphore_import, 48) +#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_external_semaphore_import__ 0 #endif #ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_mipmap__ @@ -615,14 +615,14 @@ #define __SYCL_ANY_DEVICE_HAS_ext_oneapi_bindless_images_2d_usm__ 0 #endif -#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_memory_import__ -//__SYCL_ASPECT(ext_oneapi_interop_memory_import, 46) -#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_memory_import__ 0 +#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_external_memory_import__ +//__SYCL_ASPECT(ext_oneapi_external_memory_import, 46) +#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_external_memory_import__ 0 #endif -#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_import__ -//__SYCL_ASPECT(ext_oneapi_interop_semaphore_import, 48) -#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_import__ 0 +#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_external_semaphore_import__ +//__SYCL_ASPECT(ext_oneapi_external_semaphore_import, 48) +#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_external_semaphore_import__ 0 #endif #ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_mipmap__ diff --git a/sycl/include/sycl/ext/oneapi/bindless_images.hpp b/sycl/include/sycl/ext/oneapi/bindless_images.hpp index 447e66ea50e98..dcd707aaa0be3 100644 --- a/sycl/include/sycl/ext/oneapi/bindless_images.hpp +++ b/sycl/include/sycl/ext/oneapi/bindless_images.hpp @@ -12,7 +12,7 @@ #include // for __SYCL_EXPORT #include // for device #include // for image_desc... -#include // for interop_me... +#include // for external_m... #include // for image_mem_... #include // for bindless_i... #include // for image_chan... 
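The `memcpy` → `memcpy_no_adl` rename above exists purely so that unqualified `memcpy` calls in user code cannot pick up the internal SYCL helper through argument-dependent lookup. The sketch below (not part of the patch; variable names are illustrative) shows the situation the comment in `memcpy.hpp` describes and the usual user-side fix of qualifying the call.

```c++
#include <cstring>
#include <sycl/sycl.hpp>

int main() {
  sycl::vec<float, 4> a{1.f, 2.f, 3.f, 4.f}, b{};
  // An unqualified memcpy(&b, &a, ...) lets ADL consider overloads reachable
  // through SYCL's namespaces alongside libc's ::memcpy. Qualifying the call
  // removes the ambiguity, and renaming the internal helper to memcpy_no_adl
  // keeps the SYCL headers out of that overload set entirely.
  std::memcpy(&b, &a, sizeof(sycl::vec<float, 4>));
  return static_cast<int>(b[0]);
}
```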
@@ -124,150 +124,134 @@ get_mip_level_mem_handle(const image_mem_handle mipMem, unsigned int level, const sycl::queue &syclQueue); /** - * @brief Import external memory taking an external memory handle (the type - * of which is dependent on the OS & external API) and return an - * interop memory handle + * @brief Import external memory taking an external memory descriptor (the + * type of which is dependent on the OS & external API) and return an + * imported external memory object * - * @tparam ExternalMemHandleType Handle type describing external memory handle - * @param externalMem External memory descriptor - * @param syclDevice The device in which we create our interop memory - * @param syclContext The context in which we create our interop memory - * handle - * @return Interop memory handle to the external memory + * @tparam ResourceType Resource type differentiating external resource types + * @param externalMemDesc External memory descriptor + * @param syclDevice The device in which we create our external memory + * @param syclContext The context in which we create our external memory + * @return Imported opaque external memory */ -template -__SYCL_EXPORT interop_mem_handle import_external_memory( - external_mem_descriptor externalMem, +template +__SYCL_EXPORT external_mem import_external_memory( + external_mem_descriptor externalMemDesc, const sycl::device &syclDevice, const sycl::context &syclContext); /** - * @brief Import external memory taking an external memory handle (the type - * of which is dependent on the OS & external API) and return an - * interop memory handle + * @brief Import external memory taking an external memory descriptor (the + * type of which is dependent on the OS & external API) and return an + * imported external memory object * - * @tparam ExternalMemHandleType Handle type describing external memory handle - * @param externalMem External memory descriptor - * @param syclQueue The queue in which we create our interop memory - * handle - * @return Interop memory handle to the external memory + * @tparam ResourceType Resource type differentiating external resource types + * @param externalMemDesc External memory descriptor + * @param syclQueue The queue in which we create our external memory + * @return Imported opaque external memory */ -template -__SYCL_EXPORT interop_mem_handle import_external_memory( - external_mem_descriptor externalMem, - const sycl::queue &syclQueue); +template +__SYCL_EXPORT external_mem +import_external_memory(external_mem_descriptor externalMemDesc, + const sycl::queue &syclQueue); /** - * @brief Maps an interop memory handle to an image memory handle (which may + * @brief Maps an external memory object to an image memory handle (which may * have a device optimized memory layout) * - * @param memHandle Interop memory handle + * @param extMem External memory object * @param desc The image descriptor - * @param syclDevice The device in which we create our image memory handle + * @param syclDevice The device in which we create our image memory handle * @param syclContext The conext in which we create our image memory handle * @return Memory handle to externally allocated memory on the device */ __SYCL_EXPORT -image_mem_handle map_external_image_memory(interop_mem_handle memHandle, +image_mem_handle map_external_image_memory(external_mem extMem, const image_descriptor &desc, const sycl::device &syclDevice, const sycl::context &syclContext); /** - * @brief Maps an interop memory handle to an image memory handle (which may + * @brief 
Maps an external memory handle to an image memory handle (which may * have a device optimized memory layout) * - * @param memHandle Interop memory handle + * @param extMem External memory object * @param desc The image descriptor * @param syclQueue The queue in which we create our image memory handle * @return Memory handle to externally allocated memory on the device */ __SYCL_EXPORT -image_mem_handle map_external_image_memory(interop_mem_handle memHandle, +image_mem_handle map_external_image_memory(external_mem extMem, const image_descriptor &desc, const sycl::queue &syclQueue); /** - * @brief Import external semaphore taking an external semaphore handle (the - * type of which is dependent on the OS & external API) + * @brief Import external semaphore taking an external semaphore descriptor + * (the type of which is dependent on the OS & external API) * - * @tparam ExternalSemaphoreHandleType Handle type describing external - * semaphore handle + * @tparam ResourceType Resource type differentiating external resource types * @param externalSemaphoreDesc External semaphore descriptor - * @param syclDevice The device in which we create our interop semaphore - * handle - * @param syclContext The context in which we create our interop semaphore - * handle - * @return Interop semaphore handle to the external semaphore + * @param syclDevice The device in which we create our external semaphore + * @param syclContext The context in which we create our external semaphore + * @return Imported opaque external semaphore */ -template -__SYCL_EXPORT interop_semaphore_handle import_external_semaphore( - external_semaphore_descriptor - externalSemaphoreDesc, +template +__SYCL_EXPORT external_semaphore import_external_semaphore( + external_semaphore_descriptor externalSemaphoreDesc, const sycl::device &syclDevice, const sycl::context &syclContext); /** - * @brief Import external semaphore taking an external semaphore handle (the - * type of which is dependent on the OS & external API) + * @brief Import external semaphore taking an external semaphore descriptor + * (the type of which is dependent on the OS & external API) * - * @tparam ExternalSemaphoreHandleType Handle type describing external - * semaphore handle + * @tparam ResourceType Resource type differentiating external resource types * @param externalSemaphoreDesc External semaphore descriptor - * @param syclQueue The queue in which we create our interop semaphore - * handle - * @return Interop semaphore handle to the external semaphore + * @param syclQueue The queue in which we create our external semaphore + * @return Imported opaque external semaphore */ -template -__SYCL_EXPORT interop_semaphore_handle import_external_semaphore( - external_semaphore_descriptor - externalSemaphoreDesc, +template +__SYCL_EXPORT external_semaphore import_external_semaphore( + external_semaphore_descriptor externalSemaphoreDesc, const sycl::queue &syclQueue); /** * @brief Release the external semaphore * - * @param semaphoreHandle The interop semaphore handle to destroy - * @param syclDevice The device in which the interop semaphore handle was - * created - * @param syclContext The context in which the interop semaphore handle was - * created + * @param extSemaphore The external semaphore to destroy + * @param syclDevice The device in which the external semaphore was created + * @param syclContext The context in which the external semaphore was + * created */ -__SYCL_EXPORT void -release_external_semaphore(interop_semaphore_handle semaphoreHandle, - const 
sycl::device &syclDevice, - const sycl::context &syclContext); +__SYCL_EXPORT void release_external_semaphore(external_semaphore extSemaphore, + const sycl::device &syclDevice, + const sycl::context &syclContext); /** * @brief Release the external semaphore * - * @param semaphoreHandle The interop semaphore handle to destroy - * @param syclQueue The queue in which the interop semaphore handle was - * created + * @param extSemaphore The external semaphore to destroy + * @param syclQueue The queue in which the external semaphore was created */ -__SYCL_EXPORT void -release_external_semaphore(interop_semaphore_handle semaphoreHandle, - const sycl::queue &syclQueue); +__SYCL_EXPORT void release_external_semaphore(external_semaphore extSemaphore, + const sycl::queue &syclQueue); /** * @brief Release external memory * - * @param interopHandle The interop memory handle to release - * @param syclDevice The device in which the interop memory handle was - * created - * @param syclContext The context in which the interop memory handle was - * created + * @param externalMem The external memory to release + * @param syclDevice The device in which the external memory was created + * @param syclContext The context in which the external memory was created */ -__SYCL_EXPORT void release_external_memory(interop_mem_handle interopHandle, +__SYCL_EXPORT void release_external_memory(external_mem externalMem, const sycl::device &syclDevice, const sycl::context &syclContext); /** * @brief Release external memory * - * @param interopHandle The interop memory handle to release - * @param syclQueue The queue in which the interop memory handle was - * created + * @param externalMem The external memory to release + * @param syclQueue The queue in which the external memory was created */ -__SYCL_EXPORT void release_external_memory(interop_mem_handle interopHandle, +__SYCL_EXPORT void release_external_memory(external_mem externalMem, const sycl::queue &syclQueue); /** @@ -1642,7 +1626,7 @@ inline event queue::ext_oneapi_copy( } inline event queue::ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, event DepEvent, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( @@ -1654,7 +1638,7 @@ inline event queue::ext_oneapi_wait_external_semaphore( } inline event queue::ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, const std::vector &DepEvents, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( @@ -1666,7 +1650,7 @@ inline event queue::ext_oneapi_wait_external_semaphore( } inline event queue::ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, uint64_t WaitValue, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( @@ -1677,7 +1661,7 @@ inline event queue::ext_oneapi_wait_external_semaphore( } inline event queue::ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, uint64_t WaitValue, event DepEvent, const detail::code_location &CodeLoc) { 
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( @@ -1689,7 +1673,7 @@ inline event queue::ext_oneapi_wait_external_semaphore( } inline event queue::ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, uint64_t WaitValue, const std::vector &DepEvents, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); @@ -1702,7 +1686,7 @@ inline event queue::ext_oneapi_wait_external_semaphore( } inline event queue::ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( @@ -1713,7 +1697,7 @@ inline event queue::ext_oneapi_signal_external_semaphore( } inline event queue::ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, event DepEvent, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( @@ -1725,7 +1709,7 @@ inline event queue::ext_oneapi_signal_external_semaphore( } inline event queue::ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, const std::vector &DepEvents, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( @@ -1737,7 +1721,7 @@ inline event queue::ext_oneapi_signal_external_semaphore( } inline event queue::ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, uint64_t SignalValue, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( @@ -1748,7 +1732,7 @@ inline event queue::ext_oneapi_signal_external_semaphore( } inline event queue::ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, uint64_t SignalValue, event DepEvent, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); @@ -1761,7 +1745,7 @@ inline event queue::ext_oneapi_signal_external_semaphore( } inline event queue::ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, uint64_t SignalValue, const std::vector &DepEvents, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); diff --git a/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp b/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp index a9307c069e15e..a0d6eef50c5f6 100644 --- a/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp +++ b/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp @@ -30,15 +30,15 @@ enum class external_semaphore_handle_type { win32_nt_dx12_fence = 2, }; -/// Opaque interop memory handle type -struct interop_mem_handle { - using raw_handle_type = ur_exp_interop_mem_handle_t; +/// Opaque external memory handle type +struct external_mem 
{ + using raw_handle_type = ur_exp_external_mem_handle_t; raw_handle_type raw_handle; }; -/// Opaque interop semaphore handle type -struct interop_semaphore_handle { - using raw_handle_type = ur_exp_interop_semaphore_handle_t; +/// Imported opaque external semaphore +struct external_semaphore { + using raw_handle_type = ur_exp_external_semaphore_handle_t; raw_handle_type raw_handle; external_semaphore_handle_type handle_type; }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp index 368d4ae98e35e..ed513ae3d2098 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp @@ -26,7 +26,7 @@ namespace detail { template uint32_t to_uint32_t(sycl::marray x, size_t start) { uint32_t res; - sycl::detail::memcpy(&res, &x[start], sizeof(uint32_t)); + sycl::detail::memcpy_no_adl(&res, &x[start], sizeof(uint32_t)); return res; } } // namespace detail @@ -112,7 +112,7 @@ sycl::marray fabs(sycl::marray x) { (__SYCL_CUDA_ARCH__ >= 800) for (size_t i = 0; i < N / 2; i++) { auto partial_res = __clc_fabs(detail::to_uint32_t(x, i * 2)); - sycl::detail::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t)); + sycl::detail::memcpy_no_adl(&res[i * 2], &partial_res, sizeof(uint32_t)); } if (N % 2) { @@ -188,7 +188,7 @@ sycl::marray fmin(sycl::marray x, for (size_t i = 0; i < N / 2; i++) { auto partial_res = __clc_fmin(detail::to_uint32_t(x, i * 2), detail::to_uint32_t(y, i * 2)); - sycl::detail::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t)); + sycl::detail::memcpy_no_adl(&res[i * 2], &partial_res, sizeof(uint32_t)); } if (N % 2) { @@ -270,7 +270,7 @@ sycl::marray fmax(sycl::marray x, for (size_t i = 0; i < N / 2; i++) { auto partial_res = __clc_fmax(detail::to_uint32_t(x, i * 2), detail::to_uint32_t(y, i * 2)); - sycl::detail::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t)); + sycl::detail::memcpy_no_adl(&res[i * 2], &partial_res, sizeof(uint32_t)); } if (N % 2) { @@ -340,7 +340,7 @@ sycl::marray fma(sycl::marray x, auto partial_res = __clc_fma(detail::to_uint32_t(x, i * 2), detail::to_uint32_t(y, i * 2), detail::to_uint32_t(z, i * 2)); - sycl::detail::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t)); + sycl::detail::memcpy_no_adl(&res[i * 2], &partial_res, sizeof(uint32_t)); } if (N % 2) { diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index 6ba75098bd534..facc486ca2f84 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -126,7 +126,7 @@ inline __SYCL_ALWAYS_INLINE #else auto partial_res = sycl::tanh(sycl::detail::to_vec2(x, i * 2)); #endif - sycl::detail::memcpy(&res[i * 2], &partial_res, sizeof(vec)); + sycl::detail::memcpy_no_adl(&res[i * 2], &partial_res, sizeof(vec)); } if (N % 2) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) @@ -167,7 +167,7 @@ exp2(sycl::marray x) __NOEXC { #else auto partial_res = sycl::exp2(sycl::detail::to_vec2(x, i * 2)); #endif - sycl::detail::memcpy(&res[i * 2], &partial_res, sizeof(vec)); + sycl::detail::memcpy_no_adl(&res[i * 2], &partial_res, sizeof(vec)); } if (N % 2) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 1394f1f77e139..7a3bef52110db 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ 
b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -111,7 +111,7 @@ struct sub_group_mask { size_t RemainingBytes = sizeof(Bits) - BytesCopied; size_t BytesToCopy = RemainingBytes < sizeof(T) ? RemainingBytes : sizeof(T); - sycl::detail::memcpy(reinterpret_cast(&Bits) + BytesCopied, + sycl::detail::memcpy_no_adl(reinterpret_cast(&Bits) + BytesCopied, &val[I], BytesToCopy); BytesCopied += BytesToCopy; } diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 9d4038a318b78..6f2e9f9fc19b7 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -3303,45 +3303,45 @@ class __SYCL_EXPORT handler { /// Submit a non-blocking device-side wait on an external // semaphore to the queue. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore requires an explicit value to wait upon. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object void ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle); + sycl::ext::oneapi::experimental::external_semaphore extSemaphore); /// Submit a non-blocking device-side wait on an external // semaphore to the queue. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore does not support waiting on an explicitly passed value. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param WaitValue is the value that this semaphore will wait upon, until it /// allows any further commands to execute on the queue. void ext_oneapi_wait_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, uint64_t WaitValue); /// Instruct the queue to signal the external semaphore once all previous /// commands submitted to the queue have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore requires an explicit value to signal. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object void ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle); + sycl::ext::oneapi::experimental::external_semaphore extSemaphore); /// Instruct the queue to set the state of the external semaphore to /// \p SignalValue once all previous commands submitted to the queue have /// completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore does not support signalling an explicitly passed value. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object. /// \param SignalValue is the value that this semaphore signal, once all /// prior opeartions on the queue complete. 
void ext_oneapi_signal_external_semaphore( - ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, uint64_t SignalValue); private: diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index c1917cf1d7d9d..87d8c8643d422 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -39,8 +39,8 @@ __SYCL_ASPECT(ext_oneapi_bindless_images, 42) __SYCL_ASPECT(ext_oneapi_bindless_images_shared_usm, 43) __SYCL_ASPECT(ext_oneapi_bindless_images_1d_usm, 44) __SYCL_ASPECT(ext_oneapi_bindless_images_2d_usm, 45) -__SYCL_ASPECT(ext_oneapi_interop_memory_import, 46) -__SYCL_ASPECT(ext_oneapi_interop_semaphore_import, 48) +__SYCL_ASPECT(ext_oneapi_external_memory_import, 46) +__SYCL_ASPECT(ext_oneapi_external_semaphore_import, 48) __SYCL_ASPECT(ext_oneapi_mipmap, 50) __SYCL_ASPECT(ext_oneapi_mipmap_anisotropy, 51) __SYCL_ASPECT(ext_oneapi_mipmap_level_reference, 52) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 706ca59ea854b..fbab1e5ca9148 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -1840,169 +1840,169 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore requires an explicit value to wait upon. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle. + /// \param extSemaphore is an opaque external semaphore object. /// \return an event representing the wait operation. event ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( [&](handler &CGH) { - CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle); + CGH.ext_oneapi_wait_external_semaphore(extSemaphore); }, CodeLoc); } /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore requires an explicit value to wait upon. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing the wait operation. event ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, event DepEvent, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore requires an explicit value to wait upon. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle. 
+ /// \param extSemaphore is an opaque external semaphore object. /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. /// \return an event representing the wait operation. event ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, const std::vector &DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore does not support waiting on an explicitly passed value. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param WaitValue is the value that this semaphore will wait upon, until it /// allows any further commands to execute on the queue. /// \return an event representing the wait operation. event ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, uint64_t WaitValue, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore does not support waiting on an explicitly passed value. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param WaitValue is the value that this semaphore will wait upon, until it /// allows any further commands to execute on the queue. /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing the wait operation. event ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, uint64_t WaitValue, event DepEvent, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore does not support waiting on an explicitly passed value. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param WaitValue is the value that this semaphore will wait upon, until it /// allows any further commands to execute on the queue. /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. /// \return an event representing the wait operation. 
event ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, uint64_t WaitValue, const std::vector &DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue to signal the external semaphore once all previous /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore requires an explicit value to signal. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \return an event representing the signal operation. event ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue to signal the external semaphore once all previous /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore requires an explicit value to signal. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing the signal operation. event ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, event DepEvent, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue to signal the external semaphore once all previous /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore requires an explicit value to signal. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. /// \return an event representing the signal operation. event ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, const std::vector &DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue to signal the external semaphore once all previous /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore does not support signalling an explicitly passed value. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param SignalValue is the value that this semaphore signal, once all /// prior opeartions on the queue complete. /// \return an event representing the signal operation. 
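As a usage-level illustration of the rename (not part of the patch): the queue-level wait/signal entry points now take the `external_semaphore` object directly in place of `interop_semaphore_handle`. The sketch below assumes the semaphore was imported earlier via `import_external_semaphore`; the kernel and buffer are illustrative.

```c++
#include <sycl/ext/oneapi/bindless_images.hpp>
#include <sycl/sycl.hpp>

namespace syclexp = sycl::ext::oneapi::experimental;

// Run a kernel bracketed by an external semaphore: wait for the external API
// to signal it, do the work, then signal it back once the work has completed.
sycl::event run_between_semaphores(sycl::queue &q,
                                   syclexp::external_semaphore sem,
                                   float *data, size_t n) {
  q.ext_oneapi_wait_external_semaphore(sem);
  q.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) { data[i] *= 2.0f; });
  return q.ext_oneapi_signal_external_semaphore(sem);
}
```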
event ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, uint64_t SignalValue, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue to signal the external semaphore once all previous /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore does not support signalling an explicitly passed value. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param SignalValue is the value that this semaphore signal, once all /// prior opeartions on the queue complete. /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing the signal operation. event ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore extSemaphore, uint64_t SignalValue, event DepEvent, const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue to signal the external semaphore once all previous /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// An exception is thrown if \p extSemaphore is incomplete, or if the /// type of semaphore does not support signalling an explicitly passed value. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param extSemaphore is an opaque external semaphore object /// \param SignalValue is the value that this semaphore signal, once all /// prior opeartions on the queue complete. /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. /// \return an event representing the signal operation. event ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle, uint64_t SignalValue, const std::vector &DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()); diff --git a/sycl/include/syclcompat/launch.hpp b/sycl/include/syclcompat/launch.hpp index 503f29ff8b91f..eb5d774bc12d3 100644 --- a/sycl/include/syclcompat/launch.hpp +++ b/sycl/include/syclcompat/launch.hpp @@ -31,6 +31,7 @@ #include #include +#include namespace syclcompat { @@ -67,26 +68,6 @@ launch(const sycl::nd_range<3> &range, sycl::queue q, Args... args) { range, [=](sycl::nd_item<3>) { [[clang::always_inline]] F(args...); }); } -template -sycl::event launch(const sycl::nd_range<3> &range, size_t mem_size, - sycl::queue q, Args... 
args) { - static_assert(detail::getArgumentCount(F) == sizeof...(args) + 1, - "Wrong number of arguments to SYCL kernel"); - - using F_t = decltype(F); - using f_return_t = typename std::invoke_result_t; - static_assert(std::is_same::value, - "SYCL kernels should return void"); - - return q.submit([&](sycl::handler &cgh) { - auto local_acc = sycl::local_accessor(mem_size, cgh); - cgh.parallel_for(range, [=](sycl::nd_item<3>) { - auto local_mem = local_acc.get_pointer(); - [[clang::always_inline]] F(args..., local_mem); - }); - }); -} - } // namespace detail template @@ -137,87 +118,47 @@ launch(const dim3 &grid, const dim3 &threads, Args... args) { return launch(grid, threads, get_default_queue(), args...); } -/// Launches a kernel with the templated F param and arguments on a -/// device specified by the given nd_range and SYCL queue. -/// @tparam F SYCL kernel to be executed, expects signature F(T* local_mem, -/// Args... args). -/// @tparam Dim nd_range dimension number. -/// @tparam Args Types of the arguments to be passed to the kernel. -/// @param range Nd_range specifying the work group and global sizes for the -/// kernel. -/// @param q The SYCL queue on which to execute the kernel. -/// @param mem_size The size, in number of bytes, of the local -/// memory to be allocated for kernel. -/// @param args The arguments to be passed to the kernel. -/// @return A SYCL event object that can be used to synchronize with the -/// kernel's execution. -template -sycl::event launch(const sycl::nd_range &range, size_t mem_size, - sycl::queue q, Args... args) { - return detail::launch(detail::transform_nd_range(range), mem_size, q, - args...); +} // namespace syclcompat + +namespace syclcompat::experimental { + +namespace detail { + +template +sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) { + static_assert(syclcompat::args_compatible, + "Mismatch between device function signature and supplied " + "arguments. Have you correctly handled local memory/char*?"); + + sycl_exp::launch_config config(launch_policy.get_range(), + launch_policy.get_launch_properties()); + + return sycl_exp::submit_with_event(q, [&](sycl::handler &cgh) { + auto KernelFunctor = build_kernel_functor(cgh, launch_policy, args...); + if constexpr (syclcompat::detail::is_range_v< + typename LaunchPolicy::RangeT>) { + parallel_for(cgh, config, KernelFunctor); + } else { + static_assert( + syclcompat::detail::is_nd_range_v); + nd_launch(cgh, config, KernelFunctor); + } + }); } -/// Launches a kernel with the templated F param and arguments on a -/// device specified by the given nd_range using theSYCL default queue. -/// @tparam F SYCL kernel to be executed, expects signature F(T* local_mem, -/// Args... args). -/// @tparam Dim nd_range dimension number. -/// @tparam Args Types of the arguments to be passed to the kernel. -/// @param range Nd_range specifying the work group and global sizes for the -/// kernel. -/// @param mem_size The size, in number of bytes, of the local -/// memory to be allocated for kernel. -/// @param args The arguments to be passed to the kernel. -/// @return A SYCL event object that can be used to synchronize with the -/// kernel's execution. -template -sycl::event launch(const sycl::nd_range &range, size_t mem_size, - Args... 
args) { - return launch(range, mem_size, get_default_queue(), args...); } -/// Launches a kernel with the templated F param and arguments on a -/// device with a user-specified grid and block dimensions following the -/// standard of other programming models using a user-defined SYCL queue. -/// @tparam F SYCL kernel to be executed, expects signature F(T* local_mem, -/// Args... args). -/// @tparam Dim nd_range dimension number. -/// @tparam Args Types of the arguments to be passed to the kernel. -/// @param grid Grid dimensions represented with an (x, y, z) iteration space. -/// @param threads Block dimensions represented with an (x, y, z) iteration -/// space. -/// @param mem_size The size, in number of bytes, of the local -/// memory to be allocated for kernel. -/// @param args The arguments to be passed to the kernel. -/// @return A SYCL event object that can be used to synchronize with the -/// kernel's execution. -template -sycl::event launch(const dim3 &grid, const dim3 &threads, size_t mem_size, - sycl::queue q, Args... args) { - return launch(sycl::nd_range<3>{grid * threads, threads}, mem_size, q, - args...); + +template +sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) { + static_assert(detail::is_launch_policy_v); + return detail::launch(launch_policy, q, args...); } -/// Launches a kernel with the templated F param and arguments on a -/// device with a user-specified grid and block dimensions following the -/// standard of other programming models using the default SYCL queue. -/// @tparam F SYCL kernel to be executed, expects signature F(T* local_mem, -/// Args... args). -/// @tparam Dim nd_range dimension number. -/// @tparam Args Types of the arguments to be passed to the kernel. -/// @param grid Grid dimensions represented with an (x, y, z) iteration space. -/// @param threads Block dimensions represented with an (x, y, z) iteration -/// space. -/// @param mem_size The size, in number of bytes, of the -/// local memory to be allocated. -/// @param args The arguments to be passed to the kernel. -/// @return A SYCL event object that can be used to synchronize with the -/// kernel's execution. -template -sycl::event launch(const dim3 &grid, const dim3 &threads, size_t mem_size, - Args... args) { - return launch(grid, threads, mem_size, get_default_queue(), args...); +template +sycl::event launch(LaunchPolicy launch_policy, Args... args) { + static_assert(detail::is_launch_policy_v); + return launch(launch_policy, get_default_queue(), args...); } -} // namespace syclcompat +} // namespace syclcompat::experimental diff --git a/sycl/include/syclcompat/launch_experimental.hpp b/sycl/include/syclcompat/launch_experimental.hpp deleted file mode 100644 index 3074c8c20371e..0000000000000 --- a/sycl/include/syclcompat/launch_experimental.hpp +++ /dev/null @@ -1,105 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SYCLcompat - * - * launch_experimental.hpp - * - * Description: - * Launch Overloads with accepting required subgroup size - **************************************************************************/ - -#pragma once - -#include -#include -#include - -namespace syclcompat { -namespace experimental { - -//================================================================================================// -// Overloads using Local Memory // -//================================================================================================// - -template -std::enable_if_t, sycl::event> -launch(sycl::nd_range<3> launch_range, std::size_t local_memory_size, - sycl::queue queue, Args... args) { - return queue.submit([&](sycl::handler &cgh) { - sycl::local_accessor loc(local_memory_size, cgh); - cgh.parallel_for( - launch_range, - [=](sycl::nd_item<3> it) [[sycl::reqd_sub_group_size(SubgroupSize)]] { - [[clang::always_inline]] F( - args..., loc.get_multi_ptr()); - }); - }); -} - -template -std::enable_if_t, sycl::event> -launch(sycl::nd_range launch_range, std::size_t local_memory_size, - Args... args) { - return launch( - ::syclcompat::detail::transform_nd_range(launch_range), local_memory_size, - ::syclcompat::get_default_queue(), args...); -} - -template -std::enable_if_t, sycl::event> -launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim, - std::size_t local_memory_size, Args... args) { - return launch( - ::syclcompat::detail::transform_nd_range(sycl::nd_range( - sycl::range<3>(grid_dim * block_dim), sycl::range<3>(block_dim))), - local_memory_size, ::syclcompat::get_default_queue(), args...); -} - -//================================================================================================// -// Overloads not using Local Memory // -//================================================================================================// - -template -std::enable_if_t, sycl::event> -launch(sycl::nd_range<3> launch_range, sycl::queue queue, Args... args) { - return queue.submit([&](sycl::handler &cgh) { - cgh.parallel_for(launch_range, - [=](sycl::nd_item<3> it) - [[sycl::reqd_sub_group_size(SubgroupSize)]] { - [[clang::always_inline]] F(args...); - }); - }); -} - -template -std::enable_if_t, sycl::event> -launch(sycl::nd_range launch_range, Args... args) { - return launch( - ::syclcompat::detail::transform_nd_range(launch_range), - ::syclcompat::get_default_queue(), args...); -} - -template -std::enable_if_t, sycl::event> -launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim, - Args... args) { - return launch( - ::syclcompat::detail::transform_nd_range(sycl::nd_range( - sycl::range<3>(grid_dim * block_dim), sycl::range<3>(block_dim))), - ::syclcompat::get_default_queue(), args...); -} - -} // namespace experimental -} // namespace syclcompat diff --git a/sycl/include/syclcompat/launch_policy.hpp b/sycl/include/syclcompat/launch_policy.hpp new file mode 100644 index 0000000000000..1c5f6ed3e97d6 --- /dev/null +++ b/sycl/include/syclcompat/launch_policy.hpp @@ -0,0 +1,254 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * launch.hpp + * + * Description: + * launch functionality for the SYCL compatibility extension + **************************************************************************/ + +#pragma once + +#include "sycl/ext/oneapi/experimental/enqueue_functions.hpp" +#include "sycl/ext/oneapi/properties/properties.hpp" +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace syclcompat { +namespace experimental { + +namespace sycl_exp = sycl::ext::oneapi::experimental; + +// Wrapper for kernel sycl_exp::properties +template struct kernel_properties { + static_assert(sycl_exp::is_property_list_v); + using Props = Properties; + + template + kernel_properties(Props... properties) : props{properties...} {} + + template + kernel_properties(sycl_exp::properties properties) + : props{properties} {} + + Properties props; +}; + +template ::value, void>> +kernel_properties(Props... props) + -> kernel_properties; + +template +kernel_properties(sycl_exp::properties props) + -> kernel_properties>; + +// Wrapper for launch sycl_exp::properties +template struct launch_properties { + static_assert(sycl_exp::is_property_list_v); + using Props = Properties; + + template + launch_properties(Props... properties) : props{properties...} {} + + template + launch_properties(sycl_exp::properties properties) + : props{properties} {} + + Properties props; +}; + +template ::value, void>> +launch_properties(Props... props) + -> launch_properties; + +template +launch_properties(sycl_exp::properties props) + -> launch_properties>; + +// Wrapper for local memory size +struct local_mem_size { + local_mem_size(size_t size = 0) : size{size} {}; + size_t size; +}; + +// launch_policy is constructed by the user & passed to `compat_exp::launch` +template +class launch_policy { + static_assert(sycl_exp::is_property_list_v); + static_assert(sycl_exp::is_property_list_v); + static_assert(syclcompat::detail::is_range_or_nd_range_v); + static_assert(syclcompat::detail::is_nd_range_v || !LocalMem, + "sycl::range kernel launches are incompatible with local " + "memory usage!"); + +public: + using KPropsT = KProps; + using LPropsT = LProps; + using RangeT = Range; + static constexpr bool HasLocalMem = LocalMem; + +private: + launch_policy() = default; + + template + launch_policy(Ts... ts) + : _kernel_properties{detail::property_getter< + kernel_properties, kernel_properties, std::tuple>()( + std::tuple(ts...))}, + _launch_properties{detail::property_getter< + launch_properties, launch_properties, std::tuple>()( + std::tuple(ts...))}, + _local_mem_size{ + detail::local_mem_getter>()( + std::tuple(ts...))} { + check_variadic_args(ts...); + } + + template void check_variadic_args(Ts...) { + static_assert( + std::conjunction_v, + detail::is_launch_properties, + detail::is_local_mem_size>...>, + "Received an unexpected argument to ctor. Did you forget to wrap " + "in " + "compat::kernel_properties, launch_properties, local_mem_size?"); + } + +public: + template + launch_policy(Range range, Ts... ts) : launch_policy(ts...) 
{ + _range = range; + check_variadic_args(ts...); + } + + template + launch_policy(dim3 global_range, Ts... ts) : launch_policy(ts...) { + _range = Range{global_range}; + check_variadic_args(ts...); + } + + template + launch_policy(dim3 global_range, dim3 local_range, Ts... ts) + : launch_policy(ts...) { + _range = Range{global_range * local_range, local_range}; + check_variadic_args(ts...); + } + + KProps get_kernel_properties() { return _kernel_properties.props; } + LProps get_launch_properties() { return _launch_properties.props; } + size_t get_local_mem_size() { return _local_mem_size.size; } + Range get_range() { return _range; } + +private: + Range _range; + kernel_properties _kernel_properties; + launch_properties _launch_properties; + local_mem_size _local_mem_size; +}; + +// Deduction guides for launch_policy +template +launch_policy(Range, Ts...) -> launch_policy< + Range, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +template +launch_policy(sycl::range, sycl::range, Ts...) -> launch_policy< + sycl::nd_range, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +template +launch_policy(dim3, Ts...) -> launch_policy< + sycl::range<3>, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +template +launch_policy(dim3, dim3, Ts...) -> launch_policy< + sycl::nd_range<3>, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +namespace detail { + +template +struct KernelFunctor { + KernelFunctor(KProps kernel_props, Args... args) + : _kernel_properties{kernel_props}, + _argument_tuple(std::make_tuple(args...)) {} + + KernelFunctor(KProps kernel_props, sycl::local_accessor local_acc, + Args... args) + : _kernel_properties{kernel_props}, _local_acc{local_acc}, + _argument_tuple(std::make_tuple(args...)) {} + + auto get(sycl_exp::properties_tag) { return _kernel_properties; } + + __syclcompat_inline__ void + operator()(syclcompat::detail::range_to_item_t) const { + if constexpr (HasLocalMem) { + char *local_mem_ptr = static_cast( + _local_acc.template get_multi_ptr().get()); + std::apply( + [lmem_ptr = local_mem_ptr](auto &&...args) { F(args..., lmem_ptr); }, + _argument_tuple); + } else { + std::apply([](auto &&...args) { F(args...); }, _argument_tuple); + } + } + + KProps _kernel_properties; + std::tuple _argument_tuple; + std::conditional_t, std::monostate> + _local_acc; // monostate for empty type +}; + +//==================================================================== +// This helper function avoids 2 nested `if constexpr` in detail::launch +template +auto build_kernel_functor(sycl::handler &cgh, LaunchPolicy launch_policy, + Args... 
args) + -> KernelFunctor { + if constexpr (LaunchPolicy::HasLocalMem) { + sycl::local_accessor local_memory( + launch_policy.get_local_mem_size(), cgh); + return KernelFunctor( + launch_policy.get_kernel_properties(), local_memory, args...); + } else { + return KernelFunctor( + launch_policy.get_kernel_properties(), args...); + } +} + +} // namespace detail +} // namespace experimental +} // namespace syclcompat diff --git a/sycl/include/syclcompat/syclcompat.hpp b/sycl/include/syclcompat/syclcompat.hpp index 401b5681d40dd..8c5f693794948 100644 --- a/sycl/include/syclcompat/syclcompat.hpp +++ b/sycl/include/syclcompat/syclcompat.hpp @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/sycl/include/syclcompat/traits.hpp b/sycl/include/syclcompat/traits.hpp index f992c67bae8ca..2f389ccf79484 100644 --- a/sycl/include/syclcompat/traits.hpp +++ b/sycl/include/syclcompat/traits.hpp @@ -23,6 +23,10 @@ #pragma once #include +#include +#include +#include +#include #include namespace syclcompat { @@ -41,4 +45,209 @@ template struct arith { }; template using arith_t = typename arith::type; +// Traits to check device function signature matches args (with or without local +// mem) +template +struct device_fn_invocable : std::is_invocable {}; + +template +struct device_fn_lmem_invocable + : std::is_invocable {}; + +template +constexpr inline bool args_compatible = + std::conditional_t, + device_fn_invocable>::value; + +namespace detail { + +// Trait for identifying sycl::range and sycl::nd_range. +template struct is_range : std::false_type {}; +template struct is_range> : std::true_type {}; + +template constexpr bool is_range_v = is_range::value; + +template struct is_nd_range : std::false_type {}; +template struct is_nd_range> : std::true_type {}; + +template constexpr bool is_nd_range_v = is_nd_range::value; + +template +constexpr bool is_range_or_nd_range_v = + std::disjunction_v, is_nd_range>; + +// Trait range_to_item_t to convert nd_range -> nd_item, range -> item +template struct range_to_item_map; +template struct range_to_item_map> { + using ItemT = sycl::nd_item; +}; +template struct range_to_item_map> { + using ItemT = sycl::item; +}; + +template +using range_to_item_t = typename range_to_item_map::ItemT; + +} // namespace detail + +// Forward decls +namespace experimental { + +template struct kernel_properties; +template struct launch_properties; +struct local_mem_size; + +template +class launch_policy; +} // namespace experimental + +namespace experimental::detail { + +// Helper for tuple_template_index +template
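With the `SubgroupSize`-templated overloads from `launch_experimental.hpp` removed, the same effect can be expressed through `kernel_properties` on a `launch_policy`. Below is a hedged sketch, assuming the standard `sub_group_size` compile-time kernel property applies through the policy; `vectorAdd`, the pointers, and the grid/block sizes are illustrative.

```c++
#include <syclcompat/syclcompat.hpp>

// Illustrative device function matching the vectorAdd signature used in the
// documentation above.
inline void vectorAdd(const float *A, const float *B, float *C, int n) {
  size_t i = syclcompat::global_id::x();
  if (i < static_cast<size_t>(n))
    C[i] = A[i] + B[i];
}

void launch_with_required_subgroup_size(const float *d_A, const float *d_B,
                                        float *d_C, int n) {
  namespace compat_exp = syclcompat::experimental;
  namespace sycl_exp = sycl::ext::oneapi::experimental;

  syclcompat::dim3 blocksPerGrid(32), threadsPerBlock(128);

  // kernel_properties carries compile-time kernel properties such as the
  // required sub-group size; launch_policy bundles them with the launch shape.
  compat_exp::launch_policy policy{
      blocksPerGrid, threadsPerBlock,
      compat_exp::kernel_properties{sycl_exp::sub_group_size<32>}};

  compat_exp::launch<vectorAdd>(policy, d_A, d_B, d_C, n);
}
```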