35 changes: 35 additions & 0 deletions sycl/doc/syclcompat/README.md
@@ -855,6 +855,41 @@ public:
} // syclcompat
```

### ptr_to_int

The following CUDA backend-specific function is introduced in order to
translate local memory pointers into `uint32_t` or `size_t` values that hold a
byte address in the local memory state space (local corresponds to `.shared`
in NVPTX).

``` c++
namespace syclcompat {

template <typename T>
__syclcompat_inline__
    std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
                     T>
    ptr_to_int(void *ptr);

} // namespace syclcompat
```

These values can be used in inline PTX instructions that take address
operands. Such inline PTX instructions are commonly used in optimized
libraries. A simplified example usage of the above function is as follows:

``` c++
// T is uint32_t or size_t; id is the work-item's linear id.
half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
// ...
// ...
T addr =
    syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) + (id % 8) * 16);
uint32_t fragment;
#if defined(__NVPTX__)
asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
             : "=r"(fragment)
             : "r"(addr));
#endif
```
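
Each work-item's 32-bit `fragment` packs the two 16-bit matrix elements that
`ldmatrix` assigned to it, so the loaded values can be recovered by
reinterpreting the register. A minimal sketch, reusing the `fragment` variable
from the example above (mirroring the accompanying test):

``` c++
// Reinterpret the 32-bit register as two packed 16-bit elements.
half *elems = reinterpret_cast<half *>(&fragment);
half first = elems[0];  // first element assigned to this work-item
half second = elems[1]; // second element assigned to this work-item
```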

### Device Information

`sycl::device` properties are encapsulated using the `device_info` helper class.
18 changes: 18 additions & 0 deletions sycl/include/syclcompat/memory.hpp
@@ -53,6 +53,7 @@

#include <syclcompat/defs.hpp>
#include <syclcompat/device.hpp>
#include <syclcompat/traits.hpp>

#if defined(__linux__)
#include <sys/mman.h>
@@ -86,6 +87,23 @@ enum memcpy_direction {
};
}

/// Converts a local memory pointer into a raw byte address in the local
/// (`.shared` in NVPTX) memory state space. Only supported on the CUDA
/// backend.
/// \tparam T Integer type of the returned address (uint32_t or size_t).
/// \param ptr Pointer into local memory.
template <typename T>
__syclcompat_inline__
    std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
                     T>
    ptr_to_int(void *ptr) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  // Decorating the pointer with the local address space lets the backend
  // emit a raw `.shared` address rather than a generic one.
  if constexpr (std::is_same_v<T, uint32_t>) {
    return (intptr_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
  } else {
    return (size_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
  }
#else
  throw sycl::exception(make_error_code(sycl::errc::runtime),
                        "ptr_to_int is only supported on Nvidia devices.");
#endif
}

enum class memory_region {
global = 0, // device global memory
constant, // device read-only memory
63 changes: 63 additions & 0 deletions sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp
@@ -0,0 +1,63 @@
// REQUIRES: cuda
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
// RUN: %{run} %t.out
#include <sycl/detail/core.hpp>
#include <sycl/group_barrier.hpp>
#include <syclcompat/memory.hpp>

using namespace sycl;
#define NUM_ELEMENTS 64

template <class T> void test(queue stream) {
  half *res = malloc_shared<half>(NUM_ELEMENTS, stream);

  for (int i = 0; i < NUM_ELEMENTS; ++i) {
    res[i] = 0.5;
  }

  sycl::nd_range<1> global_range{sycl::range{32}, sycl::range{32}};

  stream
      .submit([&](handler &h) {
        h.parallel_for<T>(global_range, [=](nd_item<1> item) {
          sycl::group work_group = item.get_group();
          int id = item.get_global_linear_id();
          half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();

          data[id * 2] = id;
          data[id * 2 + 1] = id + 0.5;

          T addr = syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) +
                                             (id % 8) * 16);

          uint32_t fragment;
#if defined(__NVPTX__)
          asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
                       : "=r"(fragment)
                       : "r"(addr));
#endif
          sycl::group_barrier(work_group);

          half *data_ptr = reinterpret_cast<half *>(&fragment);
          res[id * 2] = data_ptr[0];
          res[id * 2 + 1] = data_ptr[1];
        });
      })
      .wait();

  for (int i = 0; i < NUM_ELEMENTS; i++) {
    assert(res[i] == static_cast<half>(i / 2.0));
  }

  free(res, stream);
}

int main() {
  queue stream{property::queue::in_order{}};
  test<size_t>(stream);
  test<uint32_t>(stream);

  std::cout << "PASS" << std::endl;
  return 0;
}