Skip to content

Commit 74180eb

Browse files
authored
[flang][rt] Add noinline attributes for CUDA compile path for successful compilation (#161760)
NVCC inlines more aggressively than Clang/GCC, which causes the exported functions in extrema.cpp and findloc.cpp to become extremely large from function specializations, leading to compilation timeouts. Marking the two functions in this change as noinline for NVCC alleviates the problem, as it removes the worst of the cross-matrix argument specializations. Also remove the workaround from #156542 that excluded findloc.cpp from the CUDA flang-rt build. Testing: ninja flang-rt builds in ~30 minutes; these two files build in ~3 minutes.
1 parent d0e9890 commit 74180eb

File tree

3 files changed

+13
-10
lines changed

3 files changed

+13
-10
lines changed

flang-rt/lib/runtime/CMakeLists.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,9 +178,6 @@ endif ()
178178
if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
179179
set(sources ${gpu_sources})
180180
elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
181-
# findloc.cpp has some issues with higher compute capability. Remove it
182-
# from CUDA build until we can lower its memory footprint.
183-
list(REMOVE_ITEM supported_sources findloc.cpp)
184181
set(sources ${supported_sources})
185182
else ()
186183
set(sources ${supported_sources} ${host_sources} ${f128_sources})

flang-rt/lib/runtime/extrema.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -397,9 +397,12 @@ template <TypeCategory CAT, bool IS_MAX,
397397
template <typename, bool, bool> class COMPARE>
398398
struct DoPartialMaxOrMinLocHelper {
399399
template <int KIND> struct Functor {
400-
RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result,
401-
const Descriptor &x, int kind, int dim, const Descriptor *mask,
402-
bool back, Terminator &terminator) const {
400+
// NVCC inlines more aggressively which causes too many specializations of
401+
// this function to be inlined causing compiler timeouts. Set as
402+
// noinline to allow compilation to complete.
403+
RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(const char *intrinsic,
404+
Descriptor &result, const Descriptor &x, int kind, int dim,
405+
const Descriptor *mask, bool back, Terminator &terminator) const {
403406
DoPartialMaxOrMinLoc<CAT, KIND, IS_MAX, COMPARE>(
404407
intrinsic, result, x, kind, dim, mask, back, terminator);
405408
}

flang-rt/lib/runtime/findloc.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,13 @@ template <TypeCategory CAT,
153153
class HELPER>
154154
struct NumericFindlocHelper {
155155
template <int KIND> struct Functor {
156-
RT_API_ATTRS void operator()(TypeCategory targetCat, int targetKind,
157-
Descriptor &result, const Descriptor &x, const Descriptor &target,
158-
int kind, int dim, const Descriptor *mask, bool back,
159-
Terminator &terminator) const {
156+
// NVCC inlines more aggressively which causes too many specializations of
157+
// this function to be inlined causing compiler timeouts. Set as
158+
// noinline to allow compilation to complete.
159+
RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(TypeCategory targetCat,
160+
int targetKind, Descriptor &result, const Descriptor &x,
161+
const Descriptor &target, int kind, int dim, const Descriptor *mask,
162+
bool back, Terminator &terminator) const {
160163
switch (targetCat) {
161164
case TypeCategory::Integer:
162165
case TypeCategory::Unsigned:

0 commit comments

Comments
 (0)