Skip to content
Merged
21 changes: 17 additions & 4 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -3835,10 +3835,23 @@ let Visibility = [ClangOption, CC1Option, FC1Option, FlangOption] in {
let Group = f_Group in {

def fopenmp_target_debug_EQ : Joined<["-"], "fopenmp-target-debug=">;
def fopenmp_assume_teams_oversubscription : Flag<["-"], "fopenmp-assume-teams-oversubscription">;
def fopenmp_assume_threads_oversubscription : Flag<["-"], "fopenmp-assume-threads-oversubscription">;
def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-teams-oversubscription">;
def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">;
// Oversubscription assumption flags: they let the optimizer treat every
// team/thread as having at most one loop iteration, enabling "no-loop"
// kernel generation. Wording deliberately avoids naming a specific
// frontend, since the option is shared across drivers.
def fopenmp_assume_teams_oversubscription : Flag<["-"], "fopenmp-assume-teams-oversubscription">,
  HelpText<"Allow the optimizer to assume there are at least as many teams "
           "as loop iterations; environment variables that limit the number "
           "of teams may be ignored. Combined with "
           "-fopenmp-assume-threads-oversubscription, the compiler may "
           "simplify offloaded worksharing loops so that each thread "
           "executes at most one iteration.">;
// Thread-level counterpart of -fopenmp-assume-teams-oversubscription;
// both must be set for no-loop kernel generation to kick in.
def fopenmp_assume_threads_oversubscription : Flag<["-"], "fopenmp-assume-threads-oversubscription">,
  HelpText<"Allow the optimizer to assume there are at least as many threads "
           "as loop iterations; environment variables that limit the number "
           "of threads may be ignored. Combined with "
           "-fopenmp-assume-teams-oversubscription, the compiler may "
           "simplify offloaded worksharing loops so that each thread "
           "executes at most one iteration.">;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find these HelpTexts more confusing than helpful. Proposal:

Allow the optimizer to discreetly increase the number of teams/threads. May cause environment variables that set the number of teams/threads to be ignored. The combination of -fopenmp-assume-teams-oversubscription and -fopenmp-assume-threads-oversubscription may allow the conversion of loops into sequential code by ensuring that each team/thread executes at most one iteration.

(please adapt as you see fit)

The name "assume_threads/teams_oversubscription" seems disconnected from this function at first, but I think it comes from allowing oversubscription, where the "subscription" is the number of available teams/threads as determined by thread_limit, num_teams, OMP_THREAD_LIMIT, etc., and therefore allowing the optimizer to oversubscribe this amount. Maybe there is a way to make the description more connected to the flag name.

Avoid "Flang" in the HelpText. It might also be made available in Clang one day.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback. I changed the description.

def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-teams-oversubscription">,
HelpText<"Do not assume teams oversubscription.">;
def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">,
HelpText<"Do not assume threads oversubscription.">;
def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">,
HelpText<"Assert no thread in a parallel region modifies an ICV">,
MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>;
Expand Down
8 changes: 6 additions & 2 deletions llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -1075,11 +1075,13 @@ class OpenMPIRBuilder {
/// preheader of the loop.
/// \param LoopType Information about type of loop worksharing.
/// It corresponds to type of loop workshare OpenMP pragma.
/// \param NoLoop If true, no-loop code is generated.
///
/// \returns Point where to insert code after the workshare construct.
InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
omp::WorksharingLoopType LoopType);
omp::WorksharingLoopType LoopType,
bool NoLoop);

/// Modifies the canonical loop to be a statically-scheduled workshare loop.
///
Expand Down Expand Up @@ -1199,6 +1201,7 @@ class OpenMPIRBuilder {
/// present.
/// \param LoopType Information about type of loop worksharing.
/// It corresponds to type of loop workshare OpenMP pragma.
/// \param NoLoop If true, no-loop code is generated.
///
/// \returns Point where to insert code after the workshare construct.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(
Expand All @@ -1209,7 +1212,8 @@ class OpenMPIRBuilder {
bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
bool HasOrderedClause = false,
omp::WorksharingLoopType LoopType =
omp::WorksharingLoopType::ForStaticLoop);
omp::WorksharingLoopType::ForStaticLoop,
bool NoLoop = false);

/// Tile a loop nest.
///
Expand Down
24 changes: 12 additions & 12 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4975,7 +4975,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
WorksharingLoopType LoopType,
BasicBlock *InsertBlock, Value *Ident,
Value *LoopBodyArg, Value *TripCount,
Function &LoopBodyFn) {
Function &LoopBodyFn, bool NoLoop) {
Type *TripCountTy = TripCount->getType();
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
Expand Down Expand Up @@ -5003,16 +5003,17 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
}
RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
} else
RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
} else
RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
} else {
RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
}

if/else chains should not use braced bodies for either all or none of its members

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


Builder.CreateCall(RTLFn, RealArgs);
}

static void workshareLoopTargetCallback(
OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
WorksharingLoopType LoopType) {
WorksharingLoopType LoopType, bool NoLoop) {
IRBuilder<> &Builder = OMPIRBuilder->Builder;
BasicBlock *Preheader = CLI->getPreheader();
Value *TripCount = CLI->getTripCount();
Expand Down Expand Up @@ -5059,17 +5060,16 @@ static void workshareLoopTargetCallback(
OutlinedFnCallInstruction->eraseFromParent();

createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
LoopBodyArg, TripCount, OutlinedFn);
LoopBodyArg, TripCount, OutlinedFn, NoLoop);

for (auto &ToBeDeletedItem : ToBeDeleted)
ToBeDeletedItem->eraseFromParent();
CLI->invalidate();
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
WorksharingLoopType LoopType) {
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
WorksharingLoopType LoopType, bool NoLoop) {
uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Expand Down Expand Up @@ -5152,7 +5152,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
OI.PostOutlineCB = [=, ToBeDeletedVec =
std::move(ToBeDeleted)](Function &OutlinedFn) {
workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
LoopType);
LoopType, NoLoop);
};
addOutlineInfo(std::move(OI));
return CLI->getAfterIP();
Expand All @@ -5163,9 +5163,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
bool HasSimdModifier, bool HasMonotonicModifier,
bool HasNonmonotonicModifier, bool HasOrderedClause,
WorksharingLoopType LoopType) {
WorksharingLoopType LoopType, bool NoLoop) {
if (Config.isTargetDevice())
return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
HasNonmonotonicModifier, HasOrderedClause);
Expand Down
4 changes: 3 additions & 1 deletion mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td
Original file line number Diff line number Diff line change
Expand Up @@ -230,14 +230,16 @@ def TargetRegionFlagsNone : I32BitEnumAttrCaseNone<"none">;
def TargetRegionFlagsGeneric : I32BitEnumAttrCaseBit<"generic", 0>;
def TargetRegionFlagsSpmd : I32BitEnumAttrCaseBit<"spmd", 1>;
def TargetRegionFlagsTripCount : I32BitEnumAttrCaseBit<"trip_count", 2>;
def TargetRegionFlagsNoLoop : I32BitEnumAttrCaseBit<"no_loop", 3>;

def TargetRegionFlags : OpenMP_BitEnumAttr<
"TargetRegionFlags",
"target region property flags", [
TargetRegionFlagsNone,
TargetRegionFlagsGeneric,
TargetRegionFlagsSpmd,
TargetRegionFlagsTripCount
TargetRegionFlagsTripCount,
TargetRegionFlagsNoLoop
]>;

//===----------------------------------------------------------------------===//
Expand Down
39 changes: 34 additions & 5 deletions mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2106,6 +2106,29 @@ Operation *TargetOp::getInnermostCapturedOmpOp() {
});
}

/// Decide whether an SPMD kernel may be promoted to a No-Loop kernel.
/// Promotion is legal only when no clause on the construct rules it out
/// and the user has opted in through both oversubscription assumptions.
static bool canPromoteToNoLoop(Operation *capturedOp, TeamsOp teamsOp,
                               WsloopOp *wsLoopOp) {
  // An explicit num_teams upper bound breaks the no-loop teams/threads
  // assumption, and reduction kernels are slower in no-loop mode.
  bool blockedByClause = static_cast<bool>(teamsOp.getNumTeamsUpper()) ||
                         teamsOp.getNumReductionVars() != 0 ||
                         wsLoopOp->getNumReductionVars() != 0;
  if (blockedByClause)
    return false;

  // The user must allow promotion via the module-level offload flags:
  // both the teams and the threads oversubscription assumptions are
  // required.
  auto offloadMod =
      capturedOp->getParentOfType<omp::OffloadModuleInterface>();
  if (!offloadMod)
    return false;
  if (auto ompFlags = offloadMod.getFlags())
    return ompFlags.getAssumeTeamsOversubscription() &&
           ompFlags.getAssumeThreadsOversubscription();
  return false;
}

TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) {
// A non-null captured op is only valid if it resides inside of a TargetOp
// and is the result of calling getInnermostCapturedOmpOp() on it.
Expand Down Expand Up @@ -2134,7 +2157,8 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) {

// Detect target-teams-distribute-parallel-wsloop[-simd].
if (numWrappers == 2) {
if (!isa<WsloopOp>(innermostWrapper))
WsloopOp *wsloopOp = dyn_cast<WsloopOp>(innermostWrapper);
if (!wsloopOp)
return TargetRegionFlags::generic;

innermostWrapper = std::next(innermostWrapper);
Expand All @@ -2145,12 +2169,17 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) {
if (!isa_and_present<ParallelOp>(parallelOp))
return TargetRegionFlags::generic;

Operation *teamsOp = parallelOp->getParentOp();
if (!isa_and_present<TeamsOp>(teamsOp))
TeamsOp teamsOp = dyn_cast<TeamsOp>(parallelOp->getParentOp());
if (!teamsOp)
return TargetRegionFlags::generic;

if (teamsOp->getParentOp() == targetOp.getOperation())
return TargetRegionFlags::spmd | TargetRegionFlags::trip_count;
if (teamsOp->getParentOp() == targetOp.getOperation()) {
TargetRegionFlags result =
TargetRegionFlags::spmd | TargetRegionFlags::trip_count;
if (canPromoteToNoLoop(capturedOp, teamsOp, wsloopOp))
result = result | TargetRegionFlags::no_loop;
return result;
}
}
// Detect target-teams-distribute[-simd] and target-teams-loop.
else if (isa<DistributeOp, LoopOp>(innermostWrapper)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2590,13 +2590,34 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
}

builder.SetInsertPoint(*regionBlock, (*regionBlock)->begin());

// Check if we can generate no-loop kernel
bool noLoopMode = false;
omp::TargetOp targetOp = wsloopOp->getParentOfType<mlir::omp::TargetOp>();
if (targetOp) {
Operation *targetCapturedOp = targetOp.getInnermostCapturedOmpOp();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we check here that the captured op is the omp.loop_nest wrapped by this omp.wsloop? The current implementation will set noLoopMode = true for both omp.wsloops generated by this code:

!$omp target teams distribute parallel do
do i=1,10
  !$omp do
  do j=1,i
  end do
end do

I'm not actually sure about what is the expected behavior of this, but I imagine that no-loop would just refer to the outer loop, as it's the one for which the trip count can be evaluated in the host.

Copy link
Contributor Author

@DominikAdamski DominikAdamski Sep 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Sergio,
thanks for your feedback:

Should we check here that the captured op is the omp.loop_nest wrapped by this omp.wsloop?

Yes. There is an assumption in TargetRegionFlags TargetOp::getKernelExecFlags which checks that the argument is the result of calling getInnermostCapturedOmpOp().

I'm not actually sure about what is the expected behavior of this, but I imagine that no-loop would just refer to the outer loop, as it's the one for which the trip count can be evaluated in the host.

There are 2 issues:

  1. Yes, your guess is right. The no-loop should refer only to the outer loop. I modified the OMPIRBuilder code to generate no-loop code only for distribute for static loop: 564410d#diff-a6c8db9d350ec59f4eb93f27f29468b01c9590426a11c9cb79e499bc96b28adc
  2. Currently Flang does not generate valid code for OpenMP kernel (the issue is not related to my patch):
!$omp target teams distribute parallel do
do i = 0,15
 !$omp parallel do
    do j = 1, 64
      array(i* 64 + j) = i *64 + j
    end do
 !$omp end parallel do
end do

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the clarifications, Dominik.

Yes. There is assumption in TargetRegionFlags TargetOp::getKernelExecFlags which checks if the argument is is the result of calling getInnermostCapturedOmpOp().

What I mean is adding something like this:

Operation *targetCapturedOp = targetOp.getInnermostCapturedOmpOp();
if (*loopOp == targetCapturedOp) {
  omp::TargetRegionFlags kernelFlags = targetOp.getKernelExecFlags(targetCapturedOp);
  ...
}

We need that because, if not, noLoopMode will also be set to true for every omp.wsloop nested inside of a no-loop SPMD target region, even if that loop is not the top-level SPMD one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in commit: d2e88db

// We need this check because, without it, noLoopMode would be set to true
// for every omp.wsloop nested inside a no-loop SPMD target region, even if
// that loop is not the top-level SPMD one.
if (loopOp == targetCapturedOp) {
omp::TargetRegionFlags kernelFlags =
targetOp.getKernelExecFlags(targetCapturedOp);
if (omp::bitEnumContainsAll(kernelFlags,
omp::TargetRegionFlags::spmd |
omp::TargetRegionFlags::no_loop) &&
!omp::bitEnumContainsAny(kernelFlags,
omp::TargetRegionFlags::generic))
noLoopMode = true;
}
}

llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
ompBuilder->applyWorkshareLoop(
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
convertToScheduleKind(schedule), chunk, isSimd,
scheduleMod == omp::ScheduleModifier::monotonic,
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
workshareLoopType);
workshareLoopType, noLoopMode);

if (failed(handleError(wsloopIP, opInst)))
return failure();
Expand Down Expand Up @@ -5375,6 +5396,12 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp,
? llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD
: llvm::omp::OMP_TGT_EXEC_MODE_GENERIC
: llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
if (omp::bitEnumContainsAll(kernelFlags,
omp::TargetRegionFlags::spmd |
omp::TargetRegionFlags::no_loop) &&
!omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic))
attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;

attrs.MinTeams = minTeamsVal;
attrs.MaxTeams.front() = maxTeamsVal;
attrs.MinThreads = 1;
Expand Down
96 changes: 96 additions & 0 deletions offload/test/offloading/fortran/target-no-loop.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
! REQUIRES: flang, amdgpu
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
! REQUIRES: flang, amdgpu
! REQUIRES: flang

Shouldn't this still be correct with nvidia GPUs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it should be correct with an Nvidia GPU. I fixed the test case.


! RUN: %libomptarget-compile-fortran-generic -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription
! RUN: env LIBOMPTARGET_INFO=16 OMP_NUM_TEAMS=16 OMP_TEAMS_THREAD_LIMIT=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
! Host-side verification helper: returns the number of elements of the
! 1024-element array whose value differs from its index (after each
! offloaded kernel, element i is expected to hold i).
function check_errors(array) result (errors)
integer, intent(in) :: array(1024)
integer :: errors
integer :: i
errors = 0
do i = 1, 1024
! Count every element that does not match its expected value.
if ( array( i) .ne. (i) ) then
errors = errors + 1
end if
end do
end function

! Each region below initializes the array, runs an offloaded kernel that
! writes array(i) = i (or reduces over the array), and accumulates
! mismatches via check_errors. The comment above each construct states the
! launch mode expected in the LIBOMPTARGET_INFO output, which the CHECK
! lines at the bottom of this file match in order.
program main
use omp_lib
implicit none
! NOTE(review): j is declared but never used in this program.
integer :: i,j,red
integer :: array(1024), errors = 0
array = 1

! No-loop kernel: both oversubscription flags are passed on the RUN line
! and no clause blocks promotion, so SPMD-No-Loop mode is expected.
!$omp target teams distribute parallel do
do i = 1, 1024
array(i) = i
end do
errors = errors + check_errors(array)

! SPMD kernel (num_teams clause blocks promotion to no-loop)
array = 1
!$omp target teams distribute parallel do num_teams(3)
do i = 1, 1024
array(i) = i
end do

errors = errors + check_errors(array)

! No-loop kernel: num_threads does not block promotion, only num_teams
! and reductions do.
array = 1
!$omp target teams distribute parallel do num_threads(64)
do i = 1, 1024
array(i) = i
end do

errors = errors + check_errors(array)

! SPMD kernel: no teams/distribute wrapper, so there is nothing to
! promote to no-loop mode.
array = 1
!$omp target parallel do
do i = 1, 1024
array(i) = i
end do

errors = errors + check_errors(array)

! Generic kernel: teams distribute without a parallel worksharing loop.
array = 1
!$omp target teams distribute
do i = 1, 1024
array(i) = i
end do

errors = errors + check_errors(array)

! SPMD kernel (reduction clause blocks promotion to no-loop)
array = 1
red =0
!$omp target teams distribute parallel do reduction(+:red)
do i = 1, 1024
red = red + array(i)
end do

! The array holds all ones here, so the reduction must total 1024.
if (red .ne. 1024) then
errors = errors + 1
end if

! Final tally checked by the last CHECK line ("number of errors: 0").
print *,"number of errors: ", errors

end program main

! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode
! CHECK: info: #Args: 3 Teams x Thrds: 64x 16
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode
! CHECK: info: #Args: 3 Teams x Thrds: 3x 16 {{.*}}
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode
! CHECK: info: #Args: 3 Teams x Thrds: 64x 16 {{.*}}
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode
! CHECK: info: #Args: 3 Teams x Thrds: 1x 16
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} Generic mode
! CHECK: info: #Args: 3 Teams x Thrds: 16x 16 {{.*}}
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode
! CHECK: info: #Args: 4 Teams x Thrds: 16x 16 {{.*}}
! CHECK: number of errors: 0

13 changes: 0 additions & 13 deletions openmp/device/src/Workshare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -800,10 +800,6 @@ template <typename Ty> class StaticLoopChunker {

// If we know we have more threads than iterations we can indicate that to
// avoid an outer loop.
if (config::getAssumeThreadsOversubscription()) {
OneIterationPerThread = true;
}

if (OneIterationPerThread)
ASSERT(NumThreads >= NumIters, "Broken assumption");

Expand Down Expand Up @@ -851,10 +847,6 @@ template <typename Ty> class StaticLoopChunker {

// If we know we have more blocks than iterations we can indicate that to
// avoid an outer loop.
if (config::getAssumeTeamsOversubscription()) {
OneIterationPerThread = true;
}

if (OneIterationPerThread)
ASSERT(NumBlocks >= NumIters, "Broken assumption");

Expand Down Expand Up @@ -914,11 +906,6 @@ template <typename Ty> class StaticLoopChunker {

// If we know we have more threads (across all blocks) than iterations we
// can indicate that to avoid an outer loop.
if (config::getAssumeTeamsOversubscription() &
config::getAssumeThreadsOversubscription()) {
OneIterationPerThread = true;
}

if (OneIterationPerThread)
ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");

Expand Down