Skip to content
Merged
36 changes: 36 additions & 0 deletions sycl/doc/syclcompat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -947,6 +947,42 @@ public:
};
```

### ptr_to_int

The following cuda backend specific function is introduced in order to
translate from local memory pointers to `uint32_t` or `size_t` variables that
contain a byte address to the local (local refers to`.shared` in nvptx) memory
state space.

``` c++
namespace syclcompat {
template <typename T>
__syclcompat_inline__
std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
T>
ptr_to_int(void *ptr)
} // syclcompat
```

These variables can be used in inline PTX instructions that take address
operands. Such inline PTX instructions are commonly used in optimized
libraries. A simplified example usage of the above functions is as follows:

``` c++
half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
// ...
// ...
T addr =
syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) + (id % 8) * 16);

uint32_t fragment;
#if defined(__NVPTX__)
asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
: "=r"(fragment)
: "r"(addr));
#endif
```
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you fix the formatting of this code section? Thanks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did clang-format it already using dpc++ format.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly it's not running on code sections in markdown? I'd expect uint32_t fragment to align with T addr on the line above? And the line split on lines 975-976 looks pretty wacky? If I dump this code into a cpp file and autoformat this, I get:

  half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
  // ...
  // ...
  T addr =
      syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) + (id % 8) * 16);
  uint32_t fragment;
#if defined(__NVPTX__)
  asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
               : "=r"(fragment)
               : "r"(addr));
#endif

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can see if that passes clang-format (in the test where it is used). The existing version passes the clang-format on the clang-format CI.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think clang-format runs on the README tbh.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I use the same code in the test-e2e

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly it's not running on code sections in markdown? I'd expect uint32_t fragment to align with T addr on the line above? And the line split on lines 975-976 looks pretty wacky? If I dump this code into a cpp file and autoformat this, I get:

  half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
  // ...
  // ...
  T addr =
      syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) + (id % 8) * 16);
  uint32_t fragment;
#if defined(__NVPTX__)
  asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
               : "=r"(fragment)
               : "r"(addr));
#endif

I've updated the README with this suggestion now


### Device Management

Multiple SYCL functionalities are exposed through utility functions to manage
Expand Down
18 changes: 18 additions & 0 deletions sycl/include/syclcompat/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@

#include <syclcompat/device.hpp>
#include <syclcompat/traits.hpp>
#include <syclcompat/defs.hpp>

#if defined(__linux__)
#include <sys/mman.h>
Expand Down Expand Up @@ -86,6 +87,23 @@ enum memcpy_direction {
};
}

template <typename T>
__syclcompat_inline__
std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
T>
ptr_to_int(void *ptr) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
if constexpr (std::is_same_v<T, uint32_t>) {
return (intptr_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
} else {
return (size_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
}
#else
throw sycl::exception(make_error_code(sycl::errc::runtime),
"ptr_to_int is only supported on Nvidia devices.");
#endif
}

enum class memory_region {
global = 0, // device global memory
constant, // device read-only memory
Expand Down
63 changes: 63 additions & 0 deletions sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// REQUIRES: cuda
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
// RUN: %{run} %t.out
#include <sycl/detail/core.hpp>
#include <sycl/group_barrier.hpp>
#include <syclcompat/memory.hpp>

using namespace sycl;
#define NUM_ELEMENTS 64

template <class T> void test(queue stream) {
half *res = malloc_shared<half>(NUM_ELEMENTS, stream);

for (int i = 0; i < NUM_ELEMENTS; ++i) {
res[i] = 0.5;
}

sycl::nd_range<1> global_range{sycl::range{32}, sycl::range{32}};

stream
.submit([&](handler &h) {
h.parallel_for<T>(global_range, [=](nd_item<1> item) {
sycl::group work_group = item.get_group();
int id = item.get_global_linear_id();
half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();

data[id * 2] = id;
data[id * 2 + 1] = id + 0.5;

T addr =
syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) + (id % 8) * 16);

uint32_t fragment;
#if defined(__NVPTX__)
asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
: "=r"(fragment)
: "r"(addr));
#endif
sycl::group_barrier(work_group);

half *data_ptr = reinterpret_cast<half *>(&fragment);
res[id * 2] = data_ptr[0];
res[id * 2 + 1] = data_ptr[1];
});
})
.wait();

for (int i = 0; i < NUM_ELEMENTS; i++) {
assert(res[i] == static_cast<half>(i / 2.0));
}

free(res, stream);
};

int main() {

queue stream{property::queue::in_order{}};
test<size_t>(stream);
test<uint32_t>(stream);

std::cout << "PASS" << std::endl;
return 0;
}
Loading