diff --git a/platforms/artic/runtime.impala b/platforms/artic/runtime.impala index 456a9412..197c6fe8 100644 --- a/platforms/artic/runtime.impala +++ b/platforms/artic/runtime.impala @@ -2,14 +2,16 @@ #[import(cc = "C", name = "anydsl_device_name")] fn runtime_device_name(_device: i32) -> &[u8]; #[import(cc = "C", name = "anydsl_device_check_feature_support")] fn runtime_device_check_feature_support(_device: i32, _feature: &[u8]) -> bool; -#[import(cc = "C", name = "anydsl_alloc")] fn runtime_alloc(_device: i32, _size: i64) -> &mut [i8]; -#[import(cc = "C", name = "anydsl_alloc_host")] fn runtime_alloc_host(_device: i32, _size: i64) -> &mut [i8]; -#[import(cc = "C", name = "anydsl_alloc_unified")] fn runtime_alloc_unified(_device: i32, _size: i64) -> &mut [i8]; -#[import(cc = "C", name = "anydsl_copy")] fn runtime_copy(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> (); -#[import(cc = "C", name = "anydsl_get_device_ptr")] fn runtime_get_device_ptr(_device: i32, _ptr: &[i8]) -> &[i8]; -#[import(cc = "C", name = "anydsl_synchronize")] fn runtime_synchronize(_device: i32) -> (); -#[import(cc = "C", name = "anydsl_release")] fn runtime_release(_device: i32, _ptr: &[i8]) -> (); -#[import(cc = "C", name = "anydsl_release_host")] fn runtime_release_host(_device: i32, _ptr: &[i8]) -> (); +#[import(cc = "C", name = "anydsl_alloc")] fn runtime_alloc(_device: i32, _size: i64) -> &mut [i8]; +#[import(cc = "C", name = "anydsl_alloc_host")] fn runtime_alloc_host(_device: i32, _size: i64) -> &mut [i8]; +#[import(cc = "C", name = "anydsl_alloc_unified")] fn runtime_alloc_unified(_device: i32, _size: i64) -> &mut [i8]; +#[import(cc = "C", name = "anydsl_copy")] fn runtime_copy(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> (); +#[import(cc = "C", name = "anydsl_get_device_ptr")] fn runtime_get_device_ptr(_device: i32, _ptr: &[i8]) -> &[i8]; +#[import(cc = "C", name = "anydsl_synchronize")] fn runtime_synchronize(_device: i32) -> (); +#[import(cc = "C", name = "anydsl_release")] fn runtime_release(_device: i32, _ptr: &[i8]) -> (); +#[import(cc = "C", name = "anydsl_release_host")] fn runtime_release_host(_device: i32, _ptr: &[i8]) -> (); +#[import(cc = "C", name = "anydsl_map_buffer_svm")] fn runtime_anydsl_map_buffer_svm(_device: i32, _ptr: &mut [i8], _size: i64) -> (); +#[import(cc = "C", name = "anydsl_unmap_buffer_svm")] fn runtime_anydsl_unmap_buffer_svm(_device: i32, _ptr: &mut [i8]) -> (); #[import(cc = "C", name = "anydsl_random_seed")] fn random_seed(_: u32) -> (); #[import(cc = "C", name = "anydsl_random_val_f32")] fn random_val_f32() -> f32; diff --git a/src/anydsl_runtime.cpp b/src/anydsl_runtime.cpp index d7582496..3be2d0a7 100644 --- a/src/anydsl_runtime.cpp +++ b/src/anydsl_runtime.cpp @@ -108,6 +108,14 @@ void anydsl_release_host(int32_t mask, void* ptr) { runtime().release_host(to_platform(mask), to_device(mask), ptr); } +void anydsl_map_buffer_svm(int32_t mask, void* ptr, int64_t size) { + runtime().map_buffer_svm(to_platform(mask), to_device(mask), ptr, size); +} + +void anydsl_unmap_buffer_svm(int32_t mask, void* ptr) { + runtime().unmap_buffer_svm(to_platform(mask), to_device(mask), ptr); +} + void anydsl_copy( int32_t mask_src, const void* src, int64_t offset_src, int32_t mask_dst, void* dst, int64_t offset_dst, int64_t size) { diff --git a/src/anydsl_runtime.h b/src/anydsl_runtime.h index 80bc6bed..f2feb263 100644 --- a/src/anydsl_runtime.h +++ b/src/anydsl_runtime.h @@ -33,6 +33,9 @@ AnyDSL_runtime_API void* anydsl_get_device_ptr(int32_t, void*); AnyDSL_runtime_API void anydsl_release(int32_t, void*); AnyDSL_runtime_API void anydsl_release_host(int32_t, void*); +AnyDSL_runtime_API void anydsl_map_buffer_svm(int32_t, void*, int64_t size); +AnyDSL_runtime_API void anydsl_unmap_buffer_svm(int32_t, void*); + AnyDSL_runtime_API void anydsl_copy(int32_t, const void*, int64_t, int32_t, void*, int64_t, int64_t); AnyDSL_runtime_API void anydsl_launch_kernel( diff --git a/src/opencl_platform.cpp b/src/opencl_platform.cpp index b26b3630..f70d19e3 100644 --- a/src/opencl_platform.cpp +++ b/src/opencl_platform.cpp @@ -202,6 +202,9 @@ OpenCLPlatform::OpenCLPlatform(Runtime* runtime) devices_.emplace_back(this, platform, device, version_major, version_minor, platform_name, device_name); #ifdef CL_VERSION_2_0 + devices_[dev].use_svm = svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER; + if (getenv("ANYDSL_CL_DISABLE_SVM")) + devices_[dev].use_svm = false; devices_[dev].svm_caps = svm_caps; #endif @@ -297,7 +300,7 @@ void* OpenCLPlatform::alloc(DeviceId dev, int64_t size) { if (!size) return nullptr; #ifdef CL_VERSION_2_0 - if (devices_[dev].version_major == 2) { + if (devices_[dev].use_svm) { cl_mem_flags flags = CL_MEM_READ_WRITE; void* mem = clSVMAlloc(devices_[dev].ctx, flags, size, 0); if (mem == nullptr) @@ -318,7 +321,7 @@ void* OpenCLPlatform::alloc_unified(DeviceId dev, int64_t size) { if (!size) return nullptr; #ifdef CL_VERSION_2_0 - if (devices_[dev].version_major == 2) { + if (devices_[dev].use_svm) { cl_mem_flags flags = CL_MEM_READ_WRITE; if (devices_[dev].svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) flags |= CL_MEM_SVM_FINE_GRAIN_BUFFER; @@ -336,7 +339,7 @@ void* OpenCLPlatform::alloc_unified(DeviceId dev, int64_t size) { void OpenCLPlatform::release(DeviceId dev, void* ptr) { #ifdef CL_VERSION_2_0 - if (devices_[dev].version_major == 2) + if (devices_[dev].use_svm) return clSVMFree(devices_[dev].ctx, ptr); #endif unused(dev); @@ -344,6 +347,26 @@ void OpenCLPlatform::release(DeviceId dev, void* ptr) { CHECK_OPENCL(err, "clReleaseMemObject()"); } +void OpenCLPlatform::map_buffer_svm(DeviceId dev, void* ptr, int64_t size) { + #ifdef CL_VERSION_2_0 + if (devices_[dev].use_svm) { + clEnqueueSVMMap(devices_[dev].queue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, ptr, size, 0, nullptr, nullptr); + } + #else + error("Coarse-grained SVM is not supported on OpenCL device %d", dev); + #endif +} + +void OpenCLPlatform::unmap_buffer_svm(DeviceId dev, void* ptr) { + #ifdef CL_VERSION_2_0 + if (devices_[dev].use_svm) { + clEnqueueSVMUnmap(devices_[dev].queue, ptr, 0, nullptr, nullptr); + } + #else + error("Coarse-grained SVM is not supported on OpenCL device %d", dev); + #endif +} + void time_kernel_callback(cl_event event, cl_int, void* data) { auto dev = reinterpret_cast(data); cl_ulong end, start; @@ -382,10 +405,10 @@ void OpenCLPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_para cl_mem struct_buf = clCreateBuffer(devices_[dev].ctx, flags, launch_params.args.sizes[i], launch_params.args.data[i], &err); CHECK_OPENCL(err, "clCreateBuffer()"); kernel_structs.push_back(struct_buf); - clSetKernelArg(kernel, i, sizeof(cl_mem), &struct_buf); + CHECK_OPENCL(clSetKernelArg(kernel, i, sizeof(cl_mem), &struct_buf), "clSetKernelArg"); } else { #ifdef CL_VERSION_2_0 - if (launch_params.args.types[i] == KernelArgType::Ptr && devices_[dev].version_major == 2) { + if (launch_params.args.types[i] == KernelArgType::Ptr && devices_[dev].use_svm) { cl_int err = clSetKernelArgSVMPointer(kernel, i, *(void**)launch_params.args.data[i]); CHECK_OPENCL(err, "clSetKernelArgSVMPointer()"); continue; @@ -455,11 +478,12 @@ void OpenCLPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, unused(dev_dst); #ifdef CL_VERSION_2_0 - if (devices_[dev_src].version_major == 2 && devices_[dev_dst].version_major == 2) + if (devices_[dev_src].use_svm && devices_[dev_dst].use_svm) return copy_svm(src, offset_src, dst, offset_dst, size); - if ((devices_[dev_src].version_major == 2 && devices_[dev_dst].version_major == 1) || - (devices_[dev_src].version_major == 1 && devices_[dev_dst].version_major == 2)) + if ((devices_[dev_src].use_svm != devices_[dev_dst].use_svm)) error("copy between SVM and non-SVM OpenCL devices % and %", dev_src, dev_dst); + if (devices_[dev_src].use_svm) + return copy_svm(src, offset_src, dst, offset_dst, size); #endif cl_int err = clEnqueueCopyBuffer(devices_[dev_src].queue, (cl_mem)src, (cl_mem)dst, offset_src, offset_dst, size, 0, NULL, NULL); @@ -469,8 +493,16 @@ void OpenCLPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, void OpenCLPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) { #ifdef CL_VERSION_2_0 - if (devices_[dev_dst].version_major == 2) - return copy_svm(src, offset_src, dst, offset_dst, size); + if (devices_[dev_dst].use_svm) { + if (!(devices_[dev_dst].svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) { + map_buffer_svm(dev_dst, const_cast(dst), size); + copy_svm(src, offset_src, dst, offset_dst, size); + unmap_buffer_svm(dev_dst, const_cast(dst)); + //copy_svm_device(dev_dst, src, offset_src, dst, offset_dst, size); + } else + copy_svm(src, offset_src, dst, offset_dst, size); + return; + } #endif cl_int err = clEnqueueWriteBuffer(devices_[dev_dst].queue, (cl_mem)dst, CL_FALSE, offset_dst, size, (char*)src + offset_src, 0, NULL, NULL); err |= clFinish(devices_[dev_dst].queue); @@ -479,14 +511,27 @@ void OpenCLPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceI void OpenCLPlatform::copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) { #ifdef CL_VERSION_2_0 - if (devices_[dev_src].version_major == 2) - return copy_svm(src, offset_src, dst, offset_dst, size); + if (devices_[dev_src].use_svm) { + if (!(devices_[dev_src].svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) { + //map_buffer_svm(dev_src, const_cast(src), size); + //copy_svm(src, offset_src, dst, offset_dst, size); + //unmap_buffer_svm(dev_src, const_cast(src)); + copy_svm_device(dev_src, src, offset_src, dst, offset_dst, size); + } + else + copy_svm(src, offset_src, dst, offset_dst, size); + return; + } #endif cl_int err = clEnqueueReadBuffer(devices_[dev_src].queue, (cl_mem)src, CL_FALSE, offset_src, size, (char*)dst + offset_dst, 0, NULL, NULL); err |= clFinish(devices_[dev_src].queue); CHECK_OPENCL(err, "clEnqueueReadBuffer()"); } +void OpenCLPlatform::copy_svm_device(DeviceId dev, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) { + clEnqueueSVMMemcpy(devices_[dev].queue, true, (char*)dst + offset_dst, (char*)src + offset_src, size, 0, nullptr, nullptr); +} + void OpenCLPlatform::copy_svm(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) { std::copy((char*)src + offset_src, (char*)src + offset_src + size, (char*)dst + offset_dst); } @@ -552,12 +597,12 @@ cl_program OpenCLPlatform::compile_program(DeviceId dev, cl_program program, con options += " -cl-std=CL" + std::to_string(devices_[dev].version_major) + "." + std::to_string(devices_[dev].version_minor); cl_build_status build_status; - cl_int err = clBuildProgram(program, 0, NULL, options.c_str(), NULL, NULL); + cl_int err = clBuildProgram(program, 1, &devices_[dev].dev, options.c_str(), NULL, NULL); err |= clGetProgramBuildInfo(program, devices_[dev].dev, CL_PROGRAM_BUILD_STATUS, sizeof(build_status), &build_status, NULL); if (build_status == CL_BUILD_ERROR || err != CL_SUCCESS) { // determine the size of the options and log - size_t log_size, options_size; + size_t log_size = 0, options_size = 0; err |= clGetProgramBuildInfo(program, devices_[dev].dev, CL_PROGRAM_BUILD_OPTIONS, 0, NULL, &options_size); err |= clGetProgramBuildInfo(program, devices_[dev].dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); diff --git a/src/opencl_platform.h b/src/opencl_platform.h index 96b07f45..79622131 100644 --- a/src/opencl_platform.h +++ b/src/opencl_platform.h @@ -31,12 +31,16 @@ class OpenCLPlatform : public Platform { void release(DeviceId dev, void* ptr) override; void release_host(DeviceId, void*) override { command_unavailable("release_host"); } + void map_buffer_svm(DeviceId, void*, int64_t) override; + void unmap_buffer_svm(DeviceId, void*) override; + void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override; void synchronize(DeviceId dev) override; void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override; void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override; void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override; + void copy_svm_device(DeviceId dev, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size); void copy_svm(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size); void dynamic_profile(DeviceId dev, const std::string& filename); @@ -58,6 +62,7 @@ class OpenCLPlatform : public Platform { cl_command_queue queue = nullptr; cl_context ctx = nullptr; #ifdef CL_VERSION_2_0 + bool use_svm = false; cl_device_svm_capabilities svm_caps; #endif bool is_intel_fpga = false; diff --git a/src/platform.h b/src/platform.h index b8d719a4..a81f4d01 100644 --- a/src/platform.h +++ b/src/platform.h @@ -37,6 +37,10 @@ class Platform { virtual void release(DeviceId dev, void* ptr) = 0; /// Releases page-locked host memory for a device on this platform. virtual void release_host(DeviceId dev, void* ptr) = 0; + /// Map a coarse-grained SVM buffer for host access + virtual void map_buffer_svm(DeviceId dev, void* ptr, int64_t size) { command_unavailable("map_buffer_svm"); }; + /// Unmap a coarse-grained SVM buffer for device access + virtual void unmap_buffer_svm(DeviceId dev, void* ptr) { command_unavailable("unmap_buffer_svm"); }; /// Launches a kernel with the given block/grid size and arguments. virtual void launch_kernel(DeviceId dev, const LaunchParams& launch_params) = 0; diff --git a/src/runtime.cpp b/src/runtime.cpp index a698fa84..9f871c0e 100644 --- a/src/runtime.cpp +++ b/src/runtime.cpp @@ -78,6 +78,16 @@ void Runtime::release_host(PlatformId plat, DeviceId dev, void* ptr) { platforms_[plat]->release_host(dev, ptr); } +void Runtime::map_buffer_svm(PlatformId plat, DeviceId dev, void* ptr, int64_t size) { + check_device(plat, dev); + platforms_[plat]->map_buffer_svm(dev, ptr, size); +} + +void Runtime::unmap_buffer_svm(PlatformId plat, DeviceId dev, void* ptr) { + check_device(plat, dev); + platforms_[plat]->unmap_buffer_svm(dev, ptr); +} + void Runtime::copy( PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src, PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) { diff --git a/src/runtime.h b/src/runtime.h index a9af9891..d14b2e35 100644 --- a/src/runtime.h +++ b/src/runtime.h @@ -69,6 +69,10 @@ class Runtime { void release(PlatformId plat, DeviceId dev, void* ptr); /// Releases previously allocated page-locked memory. void release_host(PlatformId plat, DeviceId dev, void* ptr); + /// Map a coarse-grained SVM buffer for host access + void map_buffer_svm(PlatformId plat, DeviceId dev, void* ptr, int64_t size); + /// Unmap a coarse-grained SVM buffer for device access + void unmap_buffer_svm(PlatformId plat, DeviceId dev, void* ptr); /// Copies memory between devices. void copy( PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src,