[flang-rt] Add experimental support for GPU build

jhuber6 · jhuber6 · commit 125c5b027716 · 2025-03-18T10:25:41.000-05:00
Summary: This patch adds initial support for compiling `flang-rt` directly for the GPU. The method used here matches what's already done for `libc` and `libc++` for the GPU and builds off of those projects. Mainly this requires setting up some flags and setting the sources that currently work. This will deposit the resulting library in the appropriate directory. These files are then intended to be linked via `-Xoffload-linker` support in the offloading driver. ``` lib/clang/21/lib/nvptx64-nvidia-cuda/libflang_rt.runtime.a lib/clang/21/lib/amdgcn-amd-amdhsa/libflang_rt.runtime.a ``` This is obviously missing a lot of functions, mainly the `io` support. Most of what we cannot support is due to using POSIX things that just don't make sense on the GPU. Stuff like `pthreads` or `sema`. Getting unit tests to run on this will also be a challenge. We could run tests the same way we do with `libc`, but the problem there is that the `libc` test suite is freestanding while `gtest` currently doesn't compile on the GPU because it uses a lot of weird stuff. If the unit tests were simply `int main` then it would work. I don't understand the actual runtime code very well, I'd appreciate some guidance on how to actually support Fortran IO from this interface. As I understand it, Fortran IO requires a stack-like operation, which conflicts with the SIMT model GPUs use. Worst case scenario we could burn some LDS to keep a stack, or serialize it somehow since we can always just iterate over all the active lanes. Building this right now looks like this, which depends on the arguments added in llvm#131695. 
``` -DRUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES=compiler-rt;libc;libcxx;libcxxabi;flang-rt \ -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=compiler-rt;libc;libcxx;libcxxabi;flang-rt \ -DRUNTIMES_nvptx64-nvidia-cuda_FLANG_RT_LIBC_PROVIDER=llvm \ -DRUNTIMES_nvptx64-nvidia-cuda_FLANG_RT_LIBCXX_PROVIDER=llvm \ -DRUNTIMES_amdgcn-amd-amdhsa_FLANG_RT_LIBC_PROVIDER=llvm \ -DRUNTIMES_amdgcn-amd-amdhsa_FLANG_RT_LIBCXX_PROVIDER=llvm ```
diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt
@@ -210,6 +210,15 @@ endif()
 # System Introspection #
 ########################
 
+# GPU targets need extra flags before the standard CMake compile/link checks
+# below can succeed.
+if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn")
+  string(APPEND CMAKE_REQUIRED_FLAGS " -nogpulib")
+elseif ("${LLVM_RUNTIMES_TARGET}" MATCHES "^nvptx")
+  string(APPEND CMAKE_REQUIRED_FLAGS
+         " -flto -c -Wno-unused-command-line-argument")
+endif()
+
 include(CheckCXXSymbolExists)
 include(CheckCXXSourceCompiles)
 check_cxx_symbol_exists(strerror_r string.h HAVE_STRERROR_R)
diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake
@@ -209,6 +209,13 @@ function (add_flangrt_library name)
     # Minimum required C++ version for Flang-RT, even if CMAKE_CXX_STANDARD is defined to something else.
     target_compile_features(${tgtname} PRIVATE cxx_std_17)
 
+    # With LTO enabled the archive would hold LLVM IR rather than object code,
+    # which flang cannot link since it is not LTO aware. This precedes the
+    # target-specific options, which pass -flto for the GPU targets.
+    if (FLANG_RT_HAS_FNO_LTO_FLAG)
+      target_compile_options(${tgtname} PRIVATE -fno-lto)
+    endif ()
+
     # Use compiler-specific options to disable exceptions and RTTI.
     if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
       target_compile_options(${tgtname} PRIVATE
@@ -224,6 +231,17 @@ function (add_flangrt_library name)
         )
     endif ()
 
+    # GPU targets need additional target-specific C++ compile flags.
+    if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn")
+      target_compile_options(${tgtname} PRIVATE
+          "$<$<COMPILE_LANGUAGE:CXX>:-nogpulib;-flto;-fvisibility=hidden>"
+        )
+    elseif ("${LLVM_RUNTIMES_TARGET}" MATCHES "^nvptx")
+      target_compile_options(${tgtname} PRIVATE
+          "$<$<COMPILE_LANGUAGE:CXX>:-nogpulib;-flto;-fvisibility=hidden;-Wno-unknown-cuda-version;--cuda-feature=+ptx63>"
+        )
+    endif ()
+
     # Also for CUDA source when compiling with FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA
     if (CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA")
       # Assuming gcc as host compiler.
@@ -254,13 +272,6 @@ function (add_flangrt_library name)
       target_compile_options(${tgtname} PUBLIC -U_LIBCPP_ENABLE_ASSERTIONS)
     endif ()
 
-    # When building the flang runtime if LTO is enabled the archive file
-    # contains LLVM IR rather than object code. Currently flang is not
-    # LTO aware so cannot link this file to compiled Fortran code.
-    if (FLANG_RT_HAS_FNO_LTO_FLAG)
-      target_compile_options(${tgtname} PRIVATE -fno-lto)
-    endif ()
-
     # Flang/Clang (including clang-cl) -compiled programs targeting the MSVC ABI
     # should only depend on msvcrt/ucrt. LLVM still emits libgcc/compiler-rt
     # functions in some cases like 128-bit integer math (__udivti3, __modti3,
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
@@ -12,7 +12,6 @@ find_package(Backtrace)
 set(HAVE_BACKTRACE ${Backtrace_FOUND})
 set(BACKTRACE_HEADER ${Backtrace_HEADER})
 
-
 # List of files that are buildable for all devices.
 set(supported_sources
   ${FLANG_SOURCE_DIR}/lib/Decimal/binary-to-decimal.cpp
@@ -88,6 +87,54 @@ set(host_sources
   unit-map.cpp
 )
 
+# Sources that build for the GPU; they replace the host lists on amdgcn/nvptx.
+set(gpu_sources
+  ${FLANG_SOURCE_DIR}/lib/Decimal/binary-to-decimal.cpp
+  ${FLANG_SOURCE_DIR}/lib/Decimal/decimal-to-binary.cpp
+  ISO_Fortran_binding.cpp
+  allocator-registry.cpp
+  allocatable.cpp
+  array-constructor.cpp
+  assign.cpp
+  buffer.cpp
+  character.cpp
+  connection.cpp
+  copy.cpp
+  derived-api.cpp
+  derived.cpp
+  dot-product.cpp
+  edit-output.cpp
+  extrema.cpp
+  findloc.cpp
+  format.cpp
+  inquiry.cpp
+  internal-unit.cpp
+  io-error.cpp
+  iostat.cpp
+  matmul-transpose.cpp
+  matmul.cpp
+  memory.cpp
+  misc-intrinsic.cpp
+  non-tbp-dio.cpp
+  numeric.cpp
+  pointer.cpp
+  product.cpp
+  ragged.cpp
+  stat.cpp
+  sum.cpp
+  support.cpp
+  terminator.cpp
+  tools.cpp
+  transformational.cpp
+  type-code.cpp
+  type-info.cpp
+  utf.cpp
+  complex-powi.cpp
+  reduce.cpp
+  reduction.cpp
+  temporary-stack.cpp
+)
+
 file(GLOB_RECURSE public_headers
   "${FLANG_RT_SOURCE_DIR}/include/flang_rt/*.h"
   "${FLANG_SOURCE_DIR}/include/flang/Common/*.h"
@@ -124,7 +171,11 @@ else ()
   set(f128_sources "")
 endif ()
 
-set(sources ${supported_sources} ${host_sources} ${f128_sources})
+if (NOT "${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
+  set(sources ${supported_sources} ${host_sources} ${f128_sources})
+else ()
+  set(sources ${gpu_sources})
+endif ()
 
 
 if (NOT WIN32)
diff --git a/flang/cmake/modules/FlangCommon.cmake b/flang/cmake/modules/FlangCommon.cmake
@@ -24,6 +24,13 @@ if (FLANG_RUNTIME_F128_MATH_LIB)
   add_compile_definitions(FLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}")
 endif()
 
+# The NVPTX target can't emit a binary due to the PTXAS dependency, so hard-code
+# little-endian and return early, skipping the checks in the rest of this file.
+if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^nvptx")
+  add_compile_definitions(FLANG_LITTLE_ENDIAN=1)
+  return()
+endif ()
+
 # Check if 128-bit float computations can be done via long double
 # Note that '-nostdinc++' might be implied when this code kicks in
 # (see 'runtimes/CMakeLists.txt'), so we cannot use 'cfloat' C++ header