diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
index 6dd8708afeb62..9e06250ecdef0 100644
--- a/sycl/doc/syclcompat/README.md
+++ b/sycl/doc/syclcompat/README.md
@@ -855,6 +855,41 @@ public:
 } // syclcompat
 ```
 
+### ptr_to_int
+
+The following CUDA backend specific function is introduced in order to
+translate local memory pointers into `uint32_t` or `size_t` variables that
+contain a byte address in the local (local refers to `.shared` in NVPTX)
+memory state space.
+
+``` c++
+namespace syclcompat {
+template <typename T>
+__syclcompat_inline__
+    std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
+                     T>
+    ptr_to_int(void *ptr);
+} // namespace syclcompat
+```
+
+These variables can be used in inline PTX instructions that take address
+operands. Such inline PTX instructions are commonly used in optimized
+libraries. A simplified example usage of the above function is as follows:
+
+``` c++
+  half *data = syclcompat::local_mem<half[64]>();
+  // ...
+  // ...
+  T addr = syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) +
+                                     (id % 8) * 16);
+  uint32_t fragment;
+#if defined(__NVPTX__)
+  asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
+               : "=r"(fragment)
+               : "r"(addr));
+#endif
+```
+
 ### Device Information
 
 `sycl::device` properties are encapsulated using the `device_info` helper class.
diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp
index 93dace8bb60d8..eb92d9bec51c6 100644
--- a/sycl/include/syclcompat/memory.hpp
+++ b/sycl/include/syclcompat/memory.hpp
@@ -53,6 +53,7 @@
 #include
 #include
+#include
 
 #if defined(__linux__)
 #include
@@ -86,6 +87,23 @@ enum memcpy_direction {
 };
 }
 
+template <typename T>
+__syclcompat_inline__
+    std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
+                     T>
+    ptr_to_int(void *ptr) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  if constexpr (std::is_same_v<T, uint32_t>) {
+    return (intptr_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
+  } else {
+    return (size_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
+  }
+#else
+  throw sycl::exception(make_error_code(sycl::errc::runtime),
+                        "ptr_to_int is only supported on Nvidia devices.");
+#endif
+}
+
 enum class memory_region {
   global = 0, // device global memory
   constant,   // device read-only memory
diff --git a/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp b/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp
new file mode 100644
index 0000000000000..184f7b5835810
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp
@@ -0,0 +1,63 @@
+// REQUIRES: cuda
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
+// RUN: %{run} %t.out
+#include <sycl/sycl.hpp>
+#include <syclcompat/syclcompat.hpp>
+#include <cassert>
+#include <iostream>
+
+using namespace sycl;
+#define NUM_ELEMENTS 64
+
+template <typename T> void test(queue stream) {
+  half *res = malloc_shared<half>(NUM_ELEMENTS, stream);
+
+  for (int i = 0; i < NUM_ELEMENTS; ++i) {
+    res[i] = 0.5;
+  }
+
+  sycl::nd_range<1> global_range{sycl::range{32}, sycl::range{32}};
+
+  stream
+      .submit([&](handler &h) {
+        h.parallel_for(global_range, [=](nd_item<1> item) {
+          sycl::group work_group = item.get_group();
+          int id = item.get_global_linear_id();
+          half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
+
+          data[id * 2] = id;
+          data[id * 2 + 1] = id + 0.5;
+
+          T addr = syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) +
+                                             (id % 8) * 16);
+
+          uint32_t fragment;
+#if defined(__NVPTX__)
+          asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
+                       : "=r"(fragment)
+                       : "r"(addr));
+#endif
+          sycl::group_barrier(work_group);
+
+          half *data_ptr = reinterpret_cast<half *>(&fragment);
+          res[id * 2] = data_ptr[0];
+          res[id * 2 + 1] = data_ptr[1];
+        });
+      })
+      .wait();
+
+  for (int i = 0; i < NUM_ELEMENTS; i++) {
+    assert(res[i] == static_cast<half>(i / 2.0));
+  }
+
+  free(res, stream);
+}
+
+int main() {
+  queue stream{property::queue::in_order{}};
+  test<uint32_t>(stream);
+  test<size_t>(stream);
+
+  std::cout << "PASS" << std::endl;
+  return 0;
+}
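
The README example and the e2e test above exercise the `uint32_t` overload with the 32-bit `"r"` address constraint. A minimal sketch of how the `size_t` overload could feed a 64-bit address operand is shown below; it is illustrative only and not part of the patch, and the `st.shared.u16` instruction, the kernel wrapper, and the buffer size are assumptions for demonstration.

``` c++
#include <sycl/sycl.hpp>
#include <syclcompat/syclcompat.hpp>
#include <cstdint>

void store_one_half(sycl::nd_item<1> item) {
  // Local (.shared) buffer, one half per work-item in a 32-wide work-group.
  sycl::half *data = syclcompat::local_mem<sycl::half[32]>();
  int id = item.get_local_linear_id();

  // size_t overload: yields a 64-bit byte address into the .shared space.
  size_t addr = syclcompat::ptr_to_int<size_t>(data + id);

  // Reinterpret a half value as raw 16 bits for the "h" register constraint.
  uint16_t bits = sycl::bit_cast<uint16_t>(sycl::half{1.0f});
#if defined(__NVPTX__)
  // 64-bit address operands use the "l" register constraint.
  asm volatile("st.shared.u16 [%0], %1;" : : "l"(addr), "h"(bits) : "memory");
#endif
}
```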