8 | 8 |
9 | 9 | #include "flang/Runtime/CUDA/memory.h" |
10 | 10 | #include "../terminator.h" |
| 11 | +#include "flang/Runtime/CUDA/common.h" |
11 | 12 |
12 | 13 | #include "cuda_runtime.h" |
13 | 14 |
14 | 15 | namespace Fortran::runtime::cuda { |
15 | 16 | extern "C" { |
16 | 17 |
| 18 | +void *RTDEF(CUFMemAlloc)( |
| 19 | + std::size_t bytes, unsigned type, const char *sourceFile, int sourceLine) { |
| 20 | + void *ptr = nullptr; |
| 21 | + if (bytes != 0) { |
| 22 | + if (type == kMemTypeDevice) { |
| 23 | + CUDA_REPORT_IF_ERROR(cudaMalloc((void **)&ptr, bytes)); |
| 24 | + } else if (type == kMemTypeManaged || type == kMemTypeUnified) { |
| 25 | + CUDA_REPORT_IF_ERROR( |
| 26 | + cudaMallocManaged((void **)&ptr, bytes, cudaMemAttachGlobal)); |
| 27 | + } else if (type == kMemTypePinned) { |
| 28 | + CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&ptr, bytes)); |
| 29 | + } else { |
| 30 | + Terminator terminator{sourceFile, sourceLine}; |
| 31 | + terminator.Crash("unsupported memory type"); |
| 32 | + } |
| 33 | + } |
| 34 | + return ptr; |
| 35 | +} |
| 36 | + |
| 37 | +void RTDEF(CUFMemFree)( |
| 38 | + void *ptr, unsigned type, const char *sourceFile, int sourceLine) { |
| 39 | + if (!ptr) |
| 40 | + return; |
| 41 | + if (type == kMemTypeDevice || type == kMemTypeManaged || |
| 42 | + type == kMemTypeUnified) { |
| 43 | + CUDA_REPORT_IF_ERROR(cudaFree(ptr)); |
| 44 | + } else if (type == kMemTypePinned) { |
| 45 | + CUDA_REPORT_IF_ERROR(cudaFreeHost(ptr)); |
| 46 | + } else { |
| 47 | + Terminator terminator{sourceFile, sourceLine}; |
| 48 | + terminator.Crash("unsupported memory type"); |
| 49 | + } |
| 50 | +} |
| 51 | + |
17 | 52 | void RTDEF(CUFMemsetDescriptor)(const Descriptor &desc, void *value, |
18 | 53 | const char *sourceFile, int sourceLine) { |
19 | 54 | Terminator terminator{sourceFile, sourceLine}; |
20 | 55 | terminator.Crash("not yet implemented: CUDA data transfer from a scalar " |
21 | 56 | "value to a descriptor"); |
22 | 57 | } |
23 | 58 |
| 59 | +void RTDEF(CUFDataTransferPtrPtr)(void *dst, void *src, std::size_t bytes, |
| 60 | + unsigned mode, const char *sourceFile, int sourceLine) { |
| 61 | + cudaMemcpyKind kind; |
| 62 | + if (mode == kHostToDevice) { |
| 63 | + kind = cudaMemcpyHostToDevice; |
| 64 | + } else if (mode == kDeviceToHost) { |
| 65 | + kind = cudaMemcpyDeviceToHost; |
| 66 | + } else if (mode == kDeviceToDevice) { |
| 67 | + kind = cudaMemcpyDeviceToDevice; |
| 68 | + } else { |
| 69 | + Terminator terminator{sourceFile, sourceLine}; |
| 70 | + terminator.Crash("host to host copy not supported"); |
| 71 | + } |
| 72 | + // TODO: Use cudaMemcpyAsync when we have support for stream. |
| 73 | + CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, bytes, kind)); |
| 74 | +} |
| 75 | + |
24 | 76 | void RTDEF(CUFDataTransferDescPtr)(const Descriptor &desc, void *addr, |
25 | 77 | std::size_t bytes, unsigned mode, const char *sourceFile, int sourceLine) { |
26 | 78 | Terminator terminator{sourceFile, sourceLine}; |
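For orientation only, and not part of the diff above: a minimal host-side sketch of how the new entry points compose -- allocate device memory, copy a host buffer over and back, then free it. It assumes the declarations from flang/Runtime/CUDA/memory.h, the kMemType*/k*To* constants pulled in via flang/Runtime/CUDA/common.h, and the RTNAME entry-point macro used elsewhere in the flang runtime; treat the exact spellings and namespace as assumptions rather than verified API.

// Illustrative sketch only -- NOT part of this patch. Assumes the headers,
// constants, and RTNAME macro named above.
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/memory.h"

#include <cstddef>
#include <vector>

using namespace Fortran::runtime::cuda;

void roundTrip() {
  constexpr std::size_t n{1024};
  std::vector<float> host(n, 1.0f);
  std::size_t bytes{n * sizeof(float)};

  // Device allocation; a zero-byte request would return nullptr per CUFMemAlloc.
  void *dev{RTNAME(CUFMemAlloc)(bytes, kMemTypeDevice, __FILE__, __LINE__)};

  // Host -> device, then device -> host; both go through the synchronous
  // cudaMemcpy path in CUFDataTransferPtrPtr.
  RTNAME(CUFDataTransferPtrPtr)(
      dev, host.data(), bytes, kHostToDevice, __FILE__, __LINE__);
  RTNAME(CUFDataTransferPtrPtr)(
      host.data(), dev, bytes, kDeviceToHost, __FILE__, __LINE__);

  // Free with the same memory type that was used for the allocation.
  RTNAME(CUFMemFree)(dev, kMemTypeDevice, __FILE__, __LINE__);
}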