[flang][cuda] Allow to set the stack limit size #124859

clementval · 2025-01-28T23:55:17Z

This patch adds a call to the CUFInit function just after ProgramStart when CUDA Fortran is enabled to initialize the CUDA context. This allows us to set up some context information like the stack limit that can be defined by an environment variable ACC_OFFLOAD_STACK_SIZE=<value>.

llvmbot · 2025-01-28T23:55:49Z

@llvm/pr-subscribers-flang-fir-hlfir

@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

This patch adds a call to the CUFInit function just after ProgramStart when CUDA Fortran is enabled to initialize the CUDA context. This allows us to set up some context information like the stack limit that can be defined by an environment variable CUDA_STACKLIMIT=<value>.

Full diff: https://github.com/llvm/llvm-project/pull/124859.diff

9 Files Affected:

(modified) flang/CMakeLists.txt (+7-6)
(modified) flang/include/flang/Optimizer/Builder/Runtime/Main.h (+2-1)
(added) flang/include/flang/Runtime/CUDA/init.h (+20)
(modified) flang/lib/Lower/Bridge.cpp (+3-1)
(modified) flang/lib/Optimizer/Builder/Runtime/Main.cpp (+14-1)
(modified) flang/runtime/CUDA/CMakeLists.txt (+1)
(added) flang/runtime/CUDA/init.cpp (+25)
(modified) flang/runtime/environment.cpp (+11)
(modified) flang/runtime/environment.h (+3)

diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index b619553ef83021..fb7ab4759ad37e 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -471,6 +471,13 @@ if (FLANG_INCLUDE_TESTS)
   add_compile_definitions(FLANG_INCLUDE_TESTS=1)
 endif()
 
+option(FLANG_CUF_RUNTIME
+  "Compile CUDA Fortran runtime sources" OFF)
+if (FLANG_CUF_RUNTIME)
+  find_package(CUDAToolkit REQUIRED)
+  add_compile_definitions(FLANG_CUDA_SUPPORT=1)
+endif()
+
 add_subdirectory(include)
 add_subdirectory(lib)
 add_subdirectory(cmake/modules)
@@ -481,12 +488,6 @@ if (FLANG_BUILD_TOOLS)
   add_subdirectory(tools)
 endif()
 
-option(FLANG_CUF_RUNTIME
-  "Compile CUDA Fortran runtime sources" OFF)
-if (FLANG_CUF_RUNTIME)
-  find_package(CUDAToolkit REQUIRED)
-endif()
-
 add_subdirectory(runtime)
 
 if (LLVM_INCLUDE_EXAMPLES)
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Main.h b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
index e4c5dc914c700b..a0586deade42aa 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Main.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
@@ -24,7 +24,8 @@ class GlobalOp;
 namespace fir::runtime {
 
 void genMain(fir::FirOpBuilder &builder, mlir::Location loc,
-             const std::vector<Fortran::lower::EnvironmentDefault> &defs);
+             const std::vector<Fortran::lower::EnvironmentDefault> &defs,
+             bool initCuda = false);
 }
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
diff --git a/flang/include/flang/Runtime/CUDA/init.h b/flang/include/flang/Runtime/CUDA/init.h
new file mode 100644
index 00000000000000..24bc6838227208
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/init.h
@@ -0,0 +1,20 @@
+//===-- include/flang/Runtime/CUDA/init.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_INIT_H_
+#define FORTRAN_RUNTIME_CUDA_INIT_H_
+
+#include "common.h"
+#include "flang/Runtime/entry-names.h"
+
+extern "C" {
+
+void RTDECL(CUFInit)();
+}
+
+#endif // FORTRAN_RUNTIME_CUDA_INIT_H_
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index d92dc0cf9abd62..ff80826216e4f5 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -459,7 +459,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     if (hasMainProgram)
       createGlobalOutsideOfFunctionLowering([&]() {
         fir::runtime::genMain(*builder, toLocation(),
-                              bridge.getEnvironmentDefaults());
+                              bridge.getEnvironmentDefaults(),
+                              getFoldingContext().languageFeatures().IsEnabled(
+                                  Fortran::common::LanguageFeature::CUDA));
       });
 
     finalizeOpenACCLowering();
diff --git a/flang/lib/Optimizer/Builder/Runtime/Main.cpp b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
index ab3c4ca81314ce..5156fd54020777 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Main.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
@@ -16,13 +16,17 @@
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Runtime/main.h"
 #include "flang/Runtime/stop.h"
+#ifdef FLANG_CUDA_SUPPORT
+#include "flang/Runtime/CUDA/init.h"
+#endif
 
 using namespace Fortran::runtime;
 
 /// Create a `int main(...)` that calls the Fortran entry point
 void fir::runtime::genMain(
     fir::FirOpBuilder &builder, mlir::Location loc,
-    const std::vector<Fortran::lower::EnvironmentDefault> &defs) {
+    const std::vector<Fortran::lower::EnvironmentDefault> &defs,
+    bool initCuda) {
   auto *context = builder.getContext();
   auto argcTy = builder.getDefaultIntegerType();
   auto ptrTy = mlir::LLVM::LLVMPointerType::get(context);
@@ -61,6 +65,15 @@ void fir::runtime::genMain(
   args.push_back(env);
 
   builder.create<fir::CallOp>(loc, startFn, args);
+
+#ifdef FLANG_CUDA_SUPPORT
+  if (initCuda) {
+    auto initFn = builder.createFunction(
+        loc, RTNAME_STRING(CUFInit), mlir::FunctionType::get(context, {}, {}));
+    builder.create<fir::CallOp>(loc, initFn);
+  }
+#endif
+
   builder.create<fir::CallOp>(loc, qqMainFn);
   builder.create<fir::CallOp>(loc, stopFn);
 
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index 23e01da72eded1..bfbae58086c1fd 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -17,6 +17,7 @@ add_flang_library(${CUFRT_LIBNAME}
   allocator.cpp
   allocatable.cpp
   descriptor.cpp
+  init.cpp
   kernel.cpp
   memmove-function.cpp
   memory.cpp
diff --git a/flang/runtime/CUDA/init.cpp b/flang/runtime/CUDA/init.cpp
new file mode 100644
index 00000000000000..2bffce842b9526
--- /dev/null
+++ b/flang/runtime/CUDA/init.cpp
@@ -0,0 +1,25 @@
+//===-- runtime/CUDA/init.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/init.h"
+#include "../environment.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
+
+#include "cuda_runtime.h"
+
+extern "C" {
+
+void RTDEF(CUFInit)() {
+  // Perform ctx initialization based on execution environment if necessary.
+  if (Fortran::runtime::executionEnvironment.cudaStackLimit) {
+    CUDA_REPORT_IF_ERROR(cudaDeviceSetLimit(cudaLimitStackSize,
+        Fortran::runtime::executionEnvironment.cudaStackLimit));
+  }
+}
+}
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index 52b1d99ba536ed..0f927587fb4f88 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -143,6 +143,17 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
+  if (auto *x{std::getenv("CUDA_STACKLIMIT")}) {
+    char *end;
+    auto n{std::strtol(x, &end, 10)};
+    if (n >= 0 && n < std::numeric_limits<int>::max() && *end == '\0') {
+      cudaStackLimit = n;
+    } else {
+      std::fprintf(stderr,
+          "Fortran runtime: CUDA_STACKLIMIT=%s is invalid; ignored\n", x);
+    }
+  }
+
   // TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment
 }
 
diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h
index b8b9f10e4e57f5..184f0eb8653a65 100644
--- a/flang/runtime/environment.h
+++ b/flang/runtime/environment.h
@@ -56,6 +56,9 @@ struct ExecutionEnvironment {
   bool noStopMessage{false}; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP"
   bool defaultUTF8{false}; // DEFAULT_UTF8
   bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION
+
+  // CUDA Fortran related variables
+  std::size_t cudaStackLimit{0}; // CUDA_STACKLIMIT
 };
 
 RT_OFFLOAD_VAR_GROUP_BEGIN

flang/runtime/environment.cpp

Meinersbur · 2025-01-29T16:27:50Z

This PR runs counter to the initiative to make Flang and the runtime independent. That is, the fact that the platform Flang is compiled on supports CUDA does not mean that the platform where the application runs (and that Flang-RT is compiled for) does.

Meinersbur · 2025-01-29T16:54:03Z

flang/lib/Optimizer/Builder/Runtime/Main.cpp

+
+#ifdef FLANG_CUDA_SUPPORT
+  if (initCuda) {
+    auto initFn = builder.createFunction(


What should happen in the following cases?

Flang is compiled with FLANG_CUDA_SUPPORT, compiles a CUDA program, which is then executed in an environment without CUFRuntime.so.

Flang is compiled without FLANG_CUDA_SUPPORT, compiles a CUDA program which is then executed in an environment without CUFRuntime.so.

Flang is compiled without FLANG_CUDA_SUPPORT, compiles a CUDA program which is then executed in an environment that supports CUDA. ACC_OFFLOAD_STACK_SIZE is just ignored?

Flang is compiled with FLANG_CUDA_SUPPORT, compiles a CUDA program, is statically linked to libCUFRuntime.a, which is then executed in an environment that does not support CUDA.

flang/include/flang/Runtime/CUDA/init.h

jeanPerier · 2025-01-29T16:54:42Z

flang/lib/Optimizer/Builder/Runtime/Main.cpp

 #include "flang/Runtime/stop.h"
+#ifdef FLANG_CUDA_SUPPORT
+#include "flang/Runtime/CUDA/init.h"
+#endif


What is the reason for protecting this include with an ifdef? If this header contains only a runtime signature, it does not seem to require any CUDA support when building flang.

The initCuda dynamic flag seems enough to me to control the feature.

The header is not present if the CUDA Fortran runtime is not compiled.

Whether the build configuration compiles CUFRuntime or not, init.h is still present in the git checkout.

Yeah that's correct

clementval · 2025-01-29T17:22:03Z

This PR runs counter to the initiative to make Flang and the runtime independent. That is, the fact that the platform Flang is compiled on supports CUDA does not mean that the platform where the application runs (and that Flang-RT is compiled for) does.

Let me update this to be more in line with your work

Meinersbur · 2025-01-29T17:29:08Z

Can we revert this in the meantime?

clementval · 2025-01-29T17:30:32Z

Can we revert this in the meantime?

I'm going to have something ready within the next hour or so, and as far as I can tell it doesn't break any buildbot.

…it (#124965) This patch addresses post-commit review comments from #124859. The extra compile definition is not necessary and goes against the effort to separate the runtimes from the flang compiler itself. The function declaration for `CUFInit` can be accessed anyway since the header is always present. The insertion of the call is only based on the language feature options from the folding context. A program compiled with CUDA enabled but without CUFRuntime would just fail at link time, as expected.

[flang][cuda] Allow to set the stack limit size

3641398

clementval requested review from Renaud-K, klausler and wangzpgi January 28, 2025 23:55

llvmbot added flang:runtime flang Flang issues not falling into any other category flang:fir-hlfir labels Jan 28, 2025

klausler approved these changes Jan 28, 2025

View reviewed changes

flang/runtime/environment.cpp Outdated Show resolved Hide resolved

clementval added 2 commits January 28, 2025 16:09

Update comparison

b3bcc4e

Use ACC_OFFLOAD_STACK_SIZE

af6368d

clementval merged commit 654b763 into llvm:main Jan 29, 2025
8 checks passed

clementval deleted the cuf_init branch January 29, 2025 04:57

Meinersbur requested a review from jeanPerier January 29, 2025 16:40

Meinersbur reviewed Jan 29, 2025

View reviewed changes

jeanPerier reviewed Jan 29, 2025

View reviewed changes

clementval mentioned this pull request Jan 29, 2025

[flang][cuda] Remove the need of special compile definition for CUFInit #124965

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[flang][cuda] Allow to set the stack limit size #124859

[flang][cuda] Allow to set the stack limit size #124859

Uh oh!

clementval commented Jan 28, 2025 •

edited

Loading

Uh oh!

llvmbot commented Jan 28, 2025 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

Meinersbur commented Jan 29, 2025

Uh oh!

Meinersbur Jan 29, 2025 •

edited

Loading

Uh oh!

Uh oh!

jeanPerier Jan 29, 2025

Uh oh!

clementval Jan 29, 2025

Uh oh!

Meinersbur Jan 29, 2025 •

edited

Loading

Uh oh!

clementval Jan 29, 2025

Uh oh!

clementval commented Jan 29, 2025

Uh oh!

Meinersbur commented Jan 29, 2025

Uh oh!

clementval commented Jan 29, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

[flang][cuda] Allow to set the stack limit size #124859

[flang][cuda] Allow to set the stack limit size #124859

Uh oh!

Conversation

clementval commented Jan 28, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jan 28, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Meinersbur commented Jan 29, 2025

Uh oh!

Meinersbur Jan 29, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

jeanPerier Jan 29, 2025

Choose a reason for hiding this comment

Uh oh!

clementval Jan 29, 2025

Choose a reason for hiding this comment

Uh oh!

Meinersbur Jan 29, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

clementval Jan 29, 2025

Choose a reason for hiding this comment

Uh oh!

clementval commented Jan 29, 2025

Uh oh!

Meinersbur commented Jan 29, 2025

Uh oh!

clementval commented Jan 29, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

clementval commented Jan 28, 2025 •

edited

Loading

llvmbot commented Jan 28, 2025 •

edited

Loading

Meinersbur Jan 29, 2025 •

edited

Loading

Meinersbur Jan 29, 2025 •

edited

Loading