Skip to content
Merged
40 changes: 40 additions & 0 deletions sycl/doc/syclcompat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,46 @@ The second interface allows users to allocate device local memory at runtime.
SYCLcompat provides this functionality through its kernel launch interface,
`launch<function>`, defined in the following section.

Finally, the following CUDA backend-specific functions are introduced in order
to translate from the local memory pointers introduced above to `uint32_t` or
`size_t` variables that contain a byte address to the local
(local refers to `.shared` in NVPTX) memory state space.

``` c++
namespace syclcompat::experimental {
inline __SYCL_ALWAYS_INLINE uint32_t nvvm_get_smem_pointer(void *ptr);

inline __SYCL_ALWAYS_INLINE size_t cvta_generic_to_shared(void *ptr);
} // syclcompat::experimental
```

These variables can be used in inline PTX instructions that take address
operands; see:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#addresses-as-operands
Such inline PTX instructions are commonly used in optimized libraries.
A simplified example usage of the above functions is as follows:

``` c++
half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
// ...
// ...
T addr;
if constexpr (std::is_same_v<size_t, T>) {
addr = syclcompat::experimental::cvta_generic_to_shared(
reinterpret_cast<char *>(data) + (id % 8) * 16);
} else { // T == uint32_t
addr = syclcompat::experimental::nvvm_get_smem_pointer(
reinterpret_cast<char *>(data) + (id % 8) * 16);
}

uint32_t fragment;
#if defined(__NVPTX__)
asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
: "=r"(fragment)
: "r"(addr));
#endif
```

### launch<function>

SYCLcompat provides a kernel `launch` interface which accepts a function that
Expand Down
20 changes: 20 additions & 0 deletions sycl/include/syclcompat/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,26 @@ enum memcpy_direction {
device_to_device,
automatic
};

/// Converts a generic-address-space pointer into a 32-bit byte address in the
/// local (`.shared` in NVPTX terms) memory state space, suitable for use as an
/// address operand in inline PTX instructions (e.g. `ldmatrix`).
///
/// \param ptr Pointer that must refer to local (shared) memory; behavior for
///            pointers into other memory spaces is presumably undefined —
///            TODO confirm against the NVPTX cvta semantics.
/// \return 32-bit byte address within the local memory state space.
/// \throws sycl::runtime_error with PI_ERROR_INVALID_DEVICE when called in
///         host code or on a non-NVPTX device compilation.
inline __SYCL_ALWAYS_INLINE uint32_t nvvm_get_smem_pointer(void *ptr) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  // The cast through the decorated local pointer type makes the compiler emit
  // the generic-to-local address-space conversion; the resulting intptr_t is
  // then truncated to the uint32_t return type (PTX shared-window addresses
  // fit in 32 bits).
  return (intptr_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
#else
  throw sycl::runtime_error(
      "nvvm_get_smem_pointer is only supported on Nvidia devices.",
      PI_ERROR_INVALID_DEVICE);
#endif
}

/// Converts a generic-address-space pointer into a `size_t` byte address in
/// the local (`.shared` in NVPTX terms) memory state space, mirroring the PTX
/// `cvta.to.shared` conversion, for use as an address operand in inline PTX.
///
/// \param ptr Pointer that must refer to local (shared) memory; behavior for
///            pointers into other memory spaces is presumably undefined —
///            TODO confirm against the NVPTX cvta semantics.
/// \return Full-width byte address within the local memory state space.
/// \throws sycl::runtime_error with PI_ERROR_INVALID_DEVICE when called in
///         host code or on a non-NVPTX device compilation.
inline __SYCL_ALWAYS_INLINE size_t cvta_generic_to_shared(void *ptr) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  // The cast through the decorated local pointer type makes the compiler emit
  // the generic-to-local address-space conversion; unlike
  // nvvm_get_smem_pointer, the full-width address is preserved.
  return (size_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
#else
  throw sycl::runtime_error(
      "cvta_generic_to_shared is only supported on Nvidia devices.",
      PI_ERROR_INVALID_DEVICE);
#endif
}
}

enum class memory_region {
Expand Down
70 changes: 70 additions & 0 deletions sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// REQUIRES: cuda
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
// RUN: %{run} %t.out
#include <sycl/detail/core.hpp>
#include <syclcompat.hpp>
#include <syclcompat/memory.hpp>

using namespace sycl;
#define NUM_ELEMENTS 64

// Verifies that the address produced by the generic-to-shared conversion
// helpers works as the address operand of an inline-PTX `ldmatrix` load from
// local (shared) memory.
//
// \tparam T Address type under test: size_t exercises
//           cvta_generic_to_shared, uint32_t exercises nvvm_get_smem_pointer.
// \param stream Queue targeting a CUDA device.
template <class T> void test(queue stream) {
  half *res = malloc_shared<half>(NUM_ELEMENTS, stream);

  // Seed the output with a value the kernel must overwrite.
  for (int i = 0; i < NUM_ELEMENTS; ++i) {
    res[i] = 0.5;
  }

  sycl::nd_range<1> global_range{sycl::range{32}, sycl::range{32}};

  stream
      .submit([&](handler &h) {
        h.parallel_for<T>(global_range, [=](nd_item<1> item) {
          sycl::group work_group = item.get_group();
          int id = item.get_global_linear_id();
          half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();

          // Each of the 32 work-items writes two consecutive halves, so
          // shared memory holds {0, 0.5, 1, 1.5, ...}.
          data[id * 2] = id;
          data[id * 2 + 1] = id + 0.5;

          T addr;
          if constexpr (std::is_same_v<size_t, T>) {
            addr = syclcompat::experimental::cvta_generic_to_shared(
                reinterpret_cast<char *>(data) + (id % 8) * 16);
          } else { // T == uint32_t
            addr = syclcompat::experimental::nvvm_get_smem_pointer(
                reinterpret_cast<char *>(data) + (id % 8) * 16);
          }

          // Zero-initialize: in compilation passes where __NVPTX__ is not
          // defined the asm below is compiled out, and `fragment` would
          // otherwise be read while indeterminate.
          uint32_t fragment = 0;

#if defined(__NVPTX__)
          asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
                       : "=r"(fragment)
                       : "r"(addr));
#endif
          sycl::group_barrier(work_group);

          // Each lane's 32-bit fragment carries two consecutive halves; write
          // them back to shared-allocation memory for host-side checking.
          half *data_ptr = reinterpret_cast<half *>(&fragment);
          res[id * 2] = data_ptr[0];
          res[id * 2 + 1] = data_ptr[1];
        });
      })
      .wait();

  // ldmatrix must reproduce exactly the pattern written into shared memory.
  for (int i = 0; i < NUM_ELEMENTS; i++) {
    assert(res[i] == static_cast<half>(i / 2.0));
  }

  free(res, stream);
}

int main() {

queue stream{property::queue::in_order{}};
test<size_t>(stream);
test<uint32_t>(stream);

std::cout << "PASS" << std::endl;
return 0;
}