diff --git a/cmake/anydsl_runtime-config.cmake.in b/cmake/anydsl_runtime-config.cmake.in index 0414fcbe..65045672 100644 --- a/cmake/anydsl_runtime-config.cmake.in +++ b/cmake/anydsl_runtime-config.cmake.in @@ -273,6 +273,7 @@ function(anydsl_runtime_wrap outfiles) ${AnyDSL_runtime_ROOT_DIR}/platforms/${_frontend}/intrinsics_nvvm.impala ${AnyDSL_runtime_ROOT_DIR}/platforms/${_frontend}/intrinsics_amdgpu.impala ${AnyDSL_runtime_ROOT_DIR}/platforms/${_frontend}/intrinsics_opencl.impala + ${AnyDSL_runtime_ROOT_DIR}/platforms/${_frontend}/intrinsics_spirv.impala ${AnyDSL_runtime_ROOT_DIR}/platforms/${_frontend}/intrinsics_thorin.impala ${AnyDSL_runtime_ROOT_DIR}/platforms/${_frontend}/runtime.impala ${_additional_platform_files}) diff --git a/platforms/artic/intrinsics_opencl.impala b/platforms/artic/intrinsics_opencl.impala index b2369746..0a466111 100644 --- a/platforms/artic/intrinsics_opencl.impala +++ b/platforms/artic/intrinsics_opencl.impala @@ -55,6 +55,7 @@ #[import(cc = "device", name = "min")] fn opencl_min(i32, i32) -> i32; #[import(cc = "device", name = "max")] fn opencl_max(i32, i32) -> i32; #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global(&mut addrspace(1)i32, i32) -> i32; +#[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global_f32(&mut addrspace(1)f32, f32) -> f32; #[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_shared(&mut addrspace(3)i32, i32) -> i32; #[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_global(&mut addrspace(1)i32, i32) -> i32; #[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_shared(&mut addrspace(3)i32, i32) -> i32; @@ -100,6 +101,43 @@ fn @opencl_accelerator(dev: i32) = Accelerator { barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE), }; +fn spv_cl_get_num_groups() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](24 /* BuiltInNumWorkgroups */); +fn spv_cl_get_local_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](25 /* BuiltInWorkgroupSize */); +fn spv_cl_get_group_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](26 /* BuiltInWorkgroupId */); +fn spv_cl_get_local_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](27 /* BuiltInLocalInvocationId */); +fn spv_cl_get_global_id() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](28 /* BuiltInGlobalInvocationId */); +fn spv_cl_get_global_size() = *spirv_get_builtin[&mut addrspace(8) simd[u64 * 3]](31 /* BuiltInGlobalSize */); + +fn @opencl_spirv_accelerator(dev: i32) = Accelerator { + exec = @|body| |grid, block| { + let work_item = WorkItem { + tidx = @|| spv_cl_get_local_id()(0) as i32, + tidy = @|| spv_cl_get_local_id()(1) as i32, + tidz = @|| spv_cl_get_local_id()(2) as i32, + bidx = @|| spv_cl_get_local_id()(0) as i32, + bidy = @|| spv_cl_get_group_id()(1) as i32, + bidz = @|| spv_cl_get_group_id()(2) as i32, + gidx = @|| spv_cl_get_global_id()(0) as i32, + gidy = @|| spv_cl_get_global_id()(1) as i32, + gidz = @|| spv_cl_get_global_id()(2) as i32, + bdimx = @|| spv_cl_get_local_size()(0) as i32, + bdimy = @|| spv_cl_get_local_size()(1) as i32, + bdimz = @|| spv_cl_get_local_size()(2) as i32, + gdimx = @|| spv_cl_get_global_size()(0) as i32, + gdimy = @|| spv_cl_get_global_size()(1) as i32, + gdimz = @|| spv_cl_get_global_size()(2) as i32, + nblkx = @|| spv_cl_get_num_groups()(0) as i32, + nblky = @|| spv_cl_get_num_groups()(1) as i32, + nblkz = @|| spv_cl_get_num_groups()(2) as i32 + }; + opencl_spirv(dev, grid, block, || @body(work_item)) + }, + sync = @|| synchronize_opencl(dev), + alloc = @|size| alloc_opencl(dev, size), + alloc_unified = @|size| alloc_opencl_unified(dev, size), + barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE), +}; + static opencl_intrinsics = Intrinsics { expf = opencl_expf, exp2f = opencl_exp2f, diff --git a/platforms/artic/intrinsics_spirv.impala b/platforms/artic/intrinsics_spirv.impala new file mode 100644 index 00000000..570ff470 --- /dev/null +++ b/platforms/artic/intrinsics_spirv.impala @@ -0,0 +1 @@ +#[import(cc = "device", name = "spirv.builtin")] fn spirv_get_builtin[T](i32) -> T; \ No newline at end of file diff --git a/platforms/artic/intrinsics_thorin.impala b/platforms/artic/intrinsics_thorin.impala index 93d28e60..d174d82d 100644 --- a/platforms/artic/intrinsics_thorin.impala +++ b/platforms/artic/intrinsics_thorin.impala @@ -15,6 +15,7 @@ #[import(cc = "thorin")] fn cuda(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); #[import(cc = "thorin")] fn nvvm(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); #[import(cc = "thorin")] fn opencl(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); +#[import(cc = "thorin")] fn opencl_spirv(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); #[import(cc = "thorin")] fn amdgpu_hsa(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); #[import(cc = "thorin")] fn amdgpu_pal(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); #[import(cc = "thorin")] fn reserve_shared[T](_size: i32) -> &mut addrspace(3)[T]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 283eee25..507f1db9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -178,6 +178,7 @@ if(RUNTIME_JIT) ../platforms/${frontend}/intrinsics_nvvm.impala ../platforms/${frontend}/intrinsics_amdgpu.impala ../platforms/${frontend}/intrinsics_opencl.impala + ../platforms/${frontend}/intrinsics_spirv.impala ../platforms/${frontend}/intrinsics_thorin.impala ../platforms/${frontend}/intrinsics.impala ../platforms/${frontend}/runtime.impala) diff --git a/src/opencl_platform.cpp b/src/opencl_platform.cpp index 2b9d59ec..b26b3630 100644 --- a/src/opencl_platform.cpp +++ b/src/opencl_platform.cpp @@ -357,6 +357,12 @@ void time_kernel_callback(cl_event event, cl_int, void* data) { CHECK_OPENCL(err, "clReleaseEvent()"); } +static inline bool ends_with(std::string_view str, std::string_view suffix) { + if (str.size() < suffix.size()) + return false; + return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0; +} + void OpenCLPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_params) { if (devices_[dev].is_intel_fpga && launch_params.num_args == 0) { debug("processing by autorun kernel"); @@ -364,18 +370,19 @@ void OpenCLPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_para } auto kernel = load_kernel(dev, launch_params.file_name, launch_params.kernel_name); + bool is_spirv = ends_with(launch_params.file_name, ".spv"); // set up arguments - std::vector kernel_structs(launch_params.num_args); + std::vector kernel_structs; for (uint32_t i = 0; i < launch_params.num_args; i++) { - if (launch_params.args.types[i] == KernelArgType::Struct) { + if (!is_spirv && launch_params.args.types[i] == KernelArgType::Struct) { // create a buffer for each structure argument cl_int err = CL_SUCCESS; cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR; cl_mem struct_buf = clCreateBuffer(devices_[dev].ctx, flags, launch_params.args.sizes[i], launch_params.args.data[i], &err); CHECK_OPENCL(err, "clCreateBuffer()"); - kernel_structs[i] = struct_buf; - clSetKernelArg(kernel, i, sizeof(cl_mem), &kernel_structs[i]); + kernel_structs.push_back(struct_buf); + clSetKernelArg(kernel, i, sizeof(cl_mem), &struct_buf); } else { #ifdef CL_VERSION_2_0 if (launch_params.args.types[i] == KernelArgType::Ptr && devices_[dev].version_major == 2) { @@ -421,11 +428,9 @@ void OpenCLPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_para dynamic_profile(dev, launch_params.file_name); // release temporary buffers for struct arguments - for (uint32_t i = 0; i < launch_params.num_args; i++) { - if (launch_params.args.types[i] == KernelArgType::Struct) { - cl_int err = clReleaseMemObject(kernel_structs[i]); - CHECK_OPENCL(err, "clReleaseMemObject()"); - } + for (auto tmp : kernel_structs) { + cl_int err = clReleaseMemObject(tmp); + CHECK_OPENCL(err, "clReleaseMemObject()"); } } @@ -515,6 +520,21 @@ cl_program OpenCLPlatform::load_program_binary(DeviceId dev, const std::string& return program; } +cl_program OpenCLPlatform::load_program_il(DeviceId dev, const std::string& filename, const std::string& program_string) const { +#if CL_VERSION_2_1 + const size_t program_length = program_string.length(); + const char* program_c_str = program_string.c_str(); + cl_int err = CL_SUCCESS; + cl_program program = clCreateProgramWithIL(devices_[dev].ctx, (const void*)program_c_str, program_length, &err); + CHECK_OPENCL(err, "clCreateProgramWithIL()"); + debug("Loading IL '%' for OpenCL device %", filename, dev); + + return program; +#else + error("OpenCL 2.1 or later is required for SPIR-V support."); +#endif +} + cl_program OpenCLPlatform::load_program_source(DeviceId dev, const std::string& filename, const std::string& program_string) const { const size_t program_length = program_string.length(); const char* program_c_str = program_string.c_str(); @@ -589,25 +609,28 @@ cl_kernel OpenCLPlatform::load_kernel(DeviceId dev, const std::string& filename, if (prog_it == prog_cache.end()) { opencl_dev.unlock(); - if (canonical.extension() != ".cl") - error("Incorrect extension for kernel file '%' (should be '.cl')", canonical.string()); - // load file from disk or cache auto src_path = canonical; if (opencl_dev.is_intel_fpga) src_path.replace_extension(".aocx"); std::string src_code = runtime_->load_file(src_path.string()); - // compile src or load from cache - std::string bin = opencl_dev.is_intel_fpga ? src_code : runtime_->load_from_cache(devices_[dev].platform_name + devices_[dev].device_name + src_code); - if (bin.empty()) { - program = load_program_source(dev, src_path.string(), src_code); + if (canonical.extension() == ".spv") { + program = load_program_il(dev, src_path.string(), src_code); program = compile_program(dev, program, src_path.string()); - runtime_->store_to_cache(devices_[dev].platform_name + devices_[dev].device_name + src_code, program_as_string(program)); - } else { - program = load_program_binary(dev, src_path.string(), bin); - program = compile_program(dev, program, src_path.string()); - } + } else if (canonical.extension() == ".cl") { + // compile src or load from cache + std::string bin = opencl_dev.is_intel_fpga ? src_code : runtime_->load_from_cache(devices_[dev].platform_name + devices_[dev].device_name + src_code); + if (bin.empty()) { + program = load_program_source(dev, src_path.string(), src_code); + program = compile_program(dev, program, src_path.string()); + runtime_->store_to_cache(devices_[dev].platform_name + devices_[dev].device_name + src_code, program_as_string(program)); + } else { + program = load_program_binary(dev, src_path.string(), bin); + program = compile_program(dev, program, src_path.string()); + } + } else + error("Incorrect extension for kernel file '%' (should be '.cl' or .'spv')", canonical.string()); opencl_dev.lock(); prog_cache[canonical.string()] = program; diff --git a/src/opencl_platform.h b/src/opencl_platform.h index 6f9d6c37..96b07f45 100644 --- a/src/opencl_platform.h +++ b/src/opencl_platform.h @@ -107,6 +107,7 @@ class OpenCLPlatform : public Platform { cl_kernel load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname); cl_program load_program_binary(DeviceId dev, const std::string& filename, const std::string& program_string) const; + cl_program load_program_il(DeviceId dev, const std::string& filename, const std::string& program_string) const; cl_program load_program_source(DeviceId dev, const std::string& filename, const std::string& program_string) const; cl_program compile_program(DeviceId dev, cl_program program, const std::string& filename) const;