Skip to content
Merged
40 changes: 40 additions & 0 deletions sycl/doc/syclcompat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,46 @@ The second interface allows users to allocate device local memory at runtime.
SYCLcompat provides this functionality through its kernel launch interface,
`launch<function>`, defined in the following section.

Finally, the following CUDA backend-specific functions are introduced in order
to translate from the local memory pointers introduced above to `uint32_t` or
`size_t` variables that contain a byte address to the local
(local refers to `.shared` in NVPTX) memory state space.

``` c++
namespace syclcompat::experimental {
inline __SYCL_ALWAYS_INLINE uint32_t nvvm_get_smem_pointer(void *ptr);

inline __SYCL_ALWAYS_INLINE size_t cvta_generic_to_shared(void *ptr);
} // syclcompat::experimental
```

These variables can be used in inline PTX instructions that take address
operands; see:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#addresses-as-operands
Such inline PTX instructions are commonly used in optimized libraries.
A simplified example usage of the above functions is as follows:

``` c++
half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
// ...
// ...
T addr;
if constexpr (std::is_same_v<size_t, T>) {
addr = syclcompat::experimental::cvta_generic_to_shared(
reinterpret_cast<char *>(data) + (id % 8) * 16);
} else { // T == uint32_t
addr = syclcompat::experimental::nvvm_get_smem_pointer(
reinterpret_cast<char *>(data) + (id % 8) * 16);
}

uint32_t fragment;
#if defined(__NVPTX__)
asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
: "=r"(fragment)
: "r"(addr));
#endif
```

### launch<function>

SYCLcompat provides a kernel `launch` interface which accepts a function that
Expand Down
20 changes: 20 additions & 0 deletions sycl/include/syclcompat/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,26 @@ enum memcpy_direction {
device_to_device,
automatic
};

/// Converts a generic-address-space pointer into a 32-bit byte address in the
/// local (`.shared` in NVPTX terms) memory state space, suitable for use as an
/// address operand in inline PTX instructions (e.g. `ldmatrix`).
///
/// \param ptr Pointer that must refer to local (shared) memory; behavior for
///            pointers into other memory spaces is presumably undefined —
///            TODO confirm against the NVPTX cvta semantics.
/// \return 32-bit byte address within the local memory state space.
/// \throws sycl::runtime_error with PI_ERROR_INVALID_DEVICE when called in
///         host code or on a non-NVPTX device compilation.
inline __SYCL_ALWAYS_INLINE uint32_t nvvm_get_smem_pointer(void *ptr) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  // The cast through the decorated local pointer type makes the compiler emit
  // the generic-to-local address-space conversion; the resulting intptr_t is
  // then truncated to the uint32_t return type (PTX shared-window addresses
  // fit in 32 bits).
  return (intptr_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
#else
  throw sycl::runtime_error(
      "nvvm_get_smem_pointer is only supported on Nvidia devices.",
      PI_ERROR_INVALID_DEVICE);
#endif
}

/// Converts a generic-address-space pointer into a `size_t` byte address in
/// the local (`.shared` in NVPTX terms) memory state space, mirroring the PTX
/// `cvta.to.shared` conversion, for use as an address operand in inline PTX.
///
/// \param ptr Pointer that must refer to local (shared) memory; behavior for
///            pointers into other memory spaces is presumably undefined —
///            TODO confirm against the NVPTX cvta semantics.
/// \return Full-width byte address within the local memory state space.
/// \throws sycl::runtime_error with PI_ERROR_INVALID_DEVICE when called in
///         host code or on a non-NVPTX device compilation.
inline __SYCL_ALWAYS_INLINE size_t cvta_generic_to_shared(void *ptr) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  // The cast through the decorated local pointer type makes the compiler emit
  // the generic-to-local address-space conversion; unlike
  // nvvm_get_smem_pointer, the full-width address is preserved.
  return (size_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
#else
  throw sycl::runtime_error(
      "cvta_generic_to_shared is only supported on Nvidia devices.",
      PI_ERROR_INVALID_DEVICE);
#endif
}
}

enum class memory_region {
Expand Down
70 changes: 70 additions & 0 deletions sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// REQUIRES: cuda
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
// RUN: %{run} %t.out
#include <sycl/detail/core.hpp>
#include <syclcompat.hpp>
#include <syclcompat/memory.hpp>

using namespace sycl;
#define NUM_ELEMENTS 64

// Verifies that the address produced by the generic-to-shared conversion
// helpers works as the address operand of an inline-PTX `ldmatrix` load from
// local (shared) memory.
//
// \tparam T Address type under test: size_t exercises
//           cvta_generic_to_shared, uint32_t exercises nvvm_get_smem_pointer.
// \param stream Queue targeting a CUDA device.
template <class T> void test(queue stream) {
  half *res = malloc_shared<half>(NUM_ELEMENTS, stream);

  // Seed the output with a value the kernel must overwrite.
  for (int i = 0; i < NUM_ELEMENTS; ++i) {
    res[i] = 0.5;
  }

  sycl::nd_range<1> global_range{sycl::range{32}, sycl::range{32}};

  stream
      .submit([&](handler &h) {
        h.parallel_for<T>(global_range, [=](nd_item<1> item) {
          sycl::group work_group = item.get_group();
          int id = item.get_global_linear_id();
          half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();

          // Each of the 32 work-items writes two consecutive halves, so
          // shared memory holds {0, 0.5, 1, 1.5, ...}.
          data[id * 2] = id;
          data[id * 2 + 1] = id + 0.5;

          T addr;
          if constexpr (std::is_same_v<size_t, T>) {
            addr = syclcompat::experimental::cvta_generic_to_shared(
                reinterpret_cast<char *>(data) + (id % 8) * 16);
          } else { // T == uint32_t
            addr = syclcompat::experimental::nvvm_get_smem_pointer(
                reinterpret_cast<char *>(data) + (id % 8) * 16);
          }

          // Zero-initialize: in compilation passes where __NVPTX__ is not
          // defined the asm below is compiled out, and `fragment` would
          // otherwise be read while indeterminate.
          uint32_t fragment = 0;

#if defined(__NVPTX__)
          asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
                       : "=r"(fragment)
                       : "r"(addr));
#endif
          sycl::group_barrier(work_group);

          // Each lane's 32-bit fragment carries two consecutive halves; write
          // them back to shared-allocation memory for host-side checking.
          half *data_ptr = reinterpret_cast<half *>(&fragment);
          res[id * 2] = data_ptr[0];
          res[id * 2 + 1] = data_ptr[1];
        });
      })
      .wait();

  // ldmatrix must reproduce exactly the pattern written into shared memory.
  for (int i = 0; i < NUM_ELEMENTS; i++) {
    assert(res[i] == static_cast<half>(i / 2.0));
  }

  free(res, stream);
}

int main() {

queue stream{property::queue::in_order{}};
test<size_t>(stream);
test<uint32_t>(stream);

std::cout << "PASS" << std::endl;
return 0;
}