diff --git a/flang/docs/OpenMPSupport.md b/flang/docs/OpenMPSupport.md
index 81f5f9f6dee5b..8eea39c6ba91b 100644
--- a/flang/docs/OpenMPSupport.md
+++ b/flang/docs/OpenMPSupport.md
@@ -42,10 +42,10 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
 | target update construct | P | device clause not supported |
 | declare target directive | P | |
 | teams construct | Y | |
-| distribute construct | P | dist_schedule clause not supported |
-| distribute simd construct | P | dist_schedule and linear clauses are not supported |
-| distribute parallel loop construct | P | dist_schedule clause not supported |
-| distribute parallel loop simd construct | P | dist_schedule and linear clauses are not supported |
+| distribute construct | P | |
+| distribute simd construct | P | linear clause is not supported |
+| distribute parallel loop construct | P | |
+| distribute parallel loop simd construct | P | linear clause is not supported |
 | depend clause | Y | |
 | declare reduction construct | N | |
 | atomic construct extensions | Y | |
@@ -53,13 +53,13 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
 | cancellation point construct | Y | |
 | parallel do simd construct | P | linear clause not supported |
 | target teams construct | P | device clause not supported |
-| teams distribute construct | P | dist_schedule clause not supported |
-| teams distribute simd construct | P | dist_schedule and linear clauses are not supported |
-| target teams distribute construct | P | device and dist_schedule clauses are not supported |
-| teams distribute parallel loop construct | P | dist_schedule clause not supported |
-| target teams distribute parallel loop construct | P | device and dist_schedule clauses are not supported |
-| teams distribute parallel loop simd construct | P | dist_schedule and linear clauses are not supported |
-| target teams distribute parallel loop simd construct | P | device, dist_schedule and linear clauses are not supported |
+| teams distribute construct | P | |
+| teams distribute simd construct | P | linear clause is not supported |
+| target teams distribute construct | P | device clause is not supported |
+| teams distribute parallel loop construct | P | |
+| target teams distribute parallel loop construct | P | device clause is not supported |
+| teams distribute parallel loop simd construct | P | linear clause is not supported |
+| target teams distribute parallel loop simd construct | P | device and linear clauses are not supported |
 
 ## Extensions
 ### ATOMIC construct
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index ade00e7ca27d5..da70048d28c12 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -490,7 +490,8 @@ def OMP_SCHEDULE_Dynamic : EnumVal<"dynamic", 3, 1> {}
 def OMP_SCHEDULE_Guided : EnumVal<"guided", 4, 1> {}
 def OMP_SCHEDULE_Auto : EnumVal<"auto", 5, 1> {}
 def OMP_SCHEDULE_Runtime : EnumVal<"runtime", 6, 1> {}
-def OMP_SCHEDULE_Default : EnumVal<"default", 7, 0> { let isDefault = 1; }
+def OMP_SCHEDULE_Distribute : EnumVal<"distribute", 7, 1> {}
+def OMP_SCHEDULE_Default : EnumVal<"default", 8, 0> { let isDefault = 1; }
 def OMPC_Schedule : Clause<[Spelling<"schedule">]> {
   let clangClass = "OMPScheduleClause";
   let flangClass = "OmpScheduleClause";
@@ -501,6 +502,7 @@ def OMPC_Schedule : Clause<[Spelling<"schedule">]> {
     OMP_SCHEDULE_Guided,
     OMP_SCHEDULE_Auto,
     OMP_SCHEDULE_Runtime,
+    OMP_SCHEDULE_Distribute,
     OMP_SCHEDULE_Default
   ];
 }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 7b097d1ac0ee0..b801e212ceced 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1116,11 +1116,17 @@ class OpenMPIRBuilder {
   /// \param NeedsBarrier Indicates whether a barrier must be inserted after
   ///                     the loop.
   /// \param LoopType Type of workshare loop.
+  /// \param HasDistSchedule Indicates whether the clause being lowered is
+  ///                        dist_schedule, as this is handled slightly
+  ///                        differently.
+  /// \param DistScheduleSchedType Defines the schedule type for the distribute
+  ///                              loop. Defaults to None if no distribute loop
+  ///                              is present.
   ///
   /// \returns Point where to insert code after the workshare construct.
   InsertPointOrErrorTy applyStaticWorkshareLoop(
       DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
-      omp::WorksharingLoopType LoopType, bool NeedsBarrier);
+      omp::WorksharingLoopType LoopType, bool NeedsBarrier,
+      bool HasDistSchedule = false,
+      omp::OMPScheduleType DistScheduleSchedType = omp::OMPScheduleType::None);
 
   /// Modifies the canonical loop a statically-scheduled workshare loop with a
   /// user-specified chunk size.
@@ -1133,13 +1139,22 @@
   /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
   ///                     loop.
   /// \param ChunkSize The user-specified chunk size.
+  /// \param SchedType Optional type of scheduling to be passed to the init
+  ///                  function.
+  /// \param DistScheduleChunkSize The size of a dist_schedule chunk considered
+  ///                              as a unit when scheduling. If \p nullptr,
+  ///                              defaults to 1.
+  /// \param DistScheduleSchedType Defines the schedule type for the distribute
+  ///                              loop. Defaults to None if no distribute loop
+  ///                              is present.
   ///
   /// \returns Point where to insert code after the workshare construct.
-  InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
-                                                       CanonicalLoopInfo *CLI,
-                                                       InsertPointTy AllocaIP,
-                                                       bool NeedsBarrier,
-                                                       Value *ChunkSize);
+  InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(
+      DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+      bool NeedsBarrier, Value *ChunkSize,
+      omp::OMPScheduleType SchedType =
+          omp::OMPScheduleType::UnorderedStaticChunked,
+      Value *DistScheduleChunkSize = nullptr,
+      omp::OMPScheduleType DistScheduleSchedType = omp::OMPScheduleType::None);
 
   /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
   ///
@@ -1218,6 +1233,10 @@
   /// \param LoopType Information about type of loop worksharing.
   ///                 It corresponds to type of loop workshare OpenMP pragma.
   /// \param NoLoop If true, no-loop code is generated.
+  /// \param HasDistSchedule Indicates whether the clause being lowered is
+  ///                        dist_schedule, as this is handled slightly
+  ///                        differently.
+  /// \param DistScheduleChunkSize The chunk size for the dist_schedule loop.
   ///
   /// \returns Point where to insert code after the workshare construct.
   LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(
@@ -1229,7 +1248,8 @@
       bool HasOrderedClause = false,
       omp::WorksharingLoopType LoopType =
           omp::WorksharingLoopType::ForStaticLoop,
-      bool NoLoop = false);
+      bool NoLoop = false, bool HasDistSchedule = false,
+      Value *DistScheduleChunkSize = nullptr);
 
   /// Tile a loop nest.
   ///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c962368859730..cf88c4309974f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
@@ -136,6 +137,8 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
   case OMPScheduleType::NomergeOrderedRuntime:
   case OMPScheduleType::NomergeOrderedAuto:
   case OMPScheduleType::NomergeOrderedTrapezoidal:
+  case OMPScheduleType::OrderedDistributeChunked:
+  case OMPScheduleType::OrderedDistribute:
     break;
   default:
     return false;
@@ -182,7 +185,7 @@ static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
 /// arguments.
 static OMPScheduleType
 getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
-                          bool HasSimdModifier) {
+                          bool HasSimdModifier, bool HasDistScheduleChunks) {
   // Currently, the default schedule it static.
   switch (ClauseKind) {
   case OMP_SCHEDULE_Default:
@@ -199,6 +202,9 @@ getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
   case OMP_SCHEDULE_Runtime:
     return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                            : OMPScheduleType::BaseRuntime;
+  case OMP_SCHEDULE_Distribute:
+    return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
+                                 : OMPScheduleType::BaseDistribute;
   }
   llvm_unreachable("unhandled schedule clause argument");
 }
@@ -267,9 +273,10 @@ getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
 static OMPScheduleType
 computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                           bool HasSimdModifier, bool HasMonotonicModifier,
-                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
-  OMPScheduleType BaseSchedule =
-      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
+                          bool HasNonmonotonicModifier, bool HasOrderedClause,
+                          bool HasDistScheduleChunks) {
+  OMPScheduleType BaseSchedule = getOpenMPBaseScheduleType(
+      ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
   OMPScheduleType OrderedSchedule =
       getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
   OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
@@ -4806,7 +4813,8 @@ static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
 
 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
     DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
-    WorksharingLoopType LoopType, bool NeedsBarrier) {
+    WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
+    OMPScheduleType DistScheduleSchedType) {
   assert(CLI->isValid() && "Requires a valid canonical loop");
   assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
          "Require dedicated allocate IP");
@@ -4862,15 +4870,29 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
 
   // Call the "init" function and update the trip count of the loop with the
   // value it produced.
-  SmallVector<Value *> Args(
-      {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
-  if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
-    Value *PDistUpperBound =
-        Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
-    Args.push_back(PDistUpperBound);
+  auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
+                        PUpperBound, IVTy, PStride, One, Zero, StaticInit,
+                        this](Value *SchedulingType, auto &Builder) {
+    SmallVector<Value *> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
+                               PLowerBound, PUpperBound});
+    if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
+      Value *PDistUpperBound =
+          Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
+      Args.push_back(PDistUpperBound);
+    }
+    Args.append({PStride, One, Zero});
+    createRuntimeFunctionCall(StaticInit, Args);
+  };
+  BuildInitCall(SchedulingType, Builder);
+  if (HasDistSchedule &&
+      LoopType != WorksharingLoopType::DistributeStaticLoop) {
+    Constant *DistScheduleSchedType = ConstantInt::get(
+        I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
+    // We want to emit a second init function call for the dist_schedule
+    // clause on the Distribute construct. However, this should only be done
+    // if a workshare loop is nested within a distribute construct.
+    BuildInitCall(DistScheduleSchedType, Builder);
   }
-  Args.append({PStride, One, Zero});
-  createRuntimeFunctionCall(StaticInit, Args);
 
   Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
   Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
   Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
@@ -4909,14 +4931,44 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
   return AfterIP;
 }
 
+static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
+                                   LoopInfo &LI);
+static void addLoopMetadata(CanonicalLoopInfo *Loop,
+                            ArrayRef<Metadata *> Properties);
+
+static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI,
+                                          LLVMContext &Ctx, Loop *Loop,
+                                          LoopInfo &LoopInfo,
+                                          SmallVector<Metadata *> &LoopMDList) {
+  SmallSet<BasicBlock *, 8> Reachable;
+
+  // Get the basic blocks from the loop in which memref instructions
+  // can be found.
+  // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
+  // preferably without running any passes.
+  for (BasicBlock *Block : Loop->getBlocks()) {
+    if (Block == CLI->getCond() || Block == CLI->getHeader())
+      continue;
+    Reachable.insert(Block);
+  }
+
+  // Add access group metadata to memory-access instructions.
+  MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
+  for (BasicBlock *BB : Reachable)
+    addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
+  // TODO: If the loop has existing parallel access metadata, have
+  // to combine two lists.
+  LoopMDList.push_back(MDNode::get(
+      Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
+}
+
 OpenMPIRBuilder::InsertPointOrErrorTy
-OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
-                                                 CanonicalLoopInfo *CLI,
-                                                 InsertPointTy AllocaIP,
-                                                 bool NeedsBarrier,
-                                                 Value *ChunkSize) {
+OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
+    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+    bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
+    Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
   assert(CLI->isValid() && "Requires a valid canonical loop");
-  assert(ChunkSize && "Chunk size is required");
+  assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
 
   LLVMContext &Ctx = CLI->getFunction()->getContext();
   Value *IV = CLI->getIndVar();
@@ -4930,6 +4982,18 @@ OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
   Constant *Zero = ConstantInt::get(InternalIVTy, 0);
   Constant *One = ConstantInt::get(InternalIVTy, 1);
 
+  Function *F = CLI->getFunction();
+  FunctionAnalysisManager FAM;
+  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
+  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
+  LoopAnalysis LIA;
+  LoopInfo &&LI = LIA.run(*F, FAM);
+  Loop *L = LI.getLoopFor(CLI->getHeader());
+  SmallVector<Metadata *> LoopMDList;
+  if (ChunkSize || DistScheduleChunkSize)
+    applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
+  addLoopMetadata(CLI, LoopMDList);
+
   // Declare useful OpenMP runtime functions.
   FunctionCallee StaticInit =
       getKmpcForStaticInitForType(InternalIVTy, M, *this);
@@ -4952,13 +5016,18 @@ OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
   Builder.SetCurrentDebugLocation(DL);
 
   // TODO: Detect overflow in ubsan or max-out with current tripcount.
-  Value *CastedChunkSize =
-      Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
+  Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
+      ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
+  Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
+      DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
+      "distschedulechunksize");
   Value *CastedTripCount =
       Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
-  Constant *SchedulingType = ConstantInt::get(
-      I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
+  Constant *SchedulingType =
+      ConstantInt::get(I32Type, static_cast<int>(SchedType));
+  Constant *DistSchedulingType =
+      ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
   Builder.CreateStore(Zero, PLowerBound);
   Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
   Builder.CreateStore(OrigUpperBound, PUpperBound);
@@ -4970,12 +5039,26 @@ OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
   Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
   Value *ThreadNum = getOrCreateThreadID(SrcLoc);
-  createRuntimeFunctionCall(
-      StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
-                   /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
-                   /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
-                   /*pstride=*/PStride, /*incr=*/One,
-                   /*chunk=*/CastedChunkSize});
+  auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
+                        PUpperBound, PStride, One,
+                        this](Value *SchedulingType, Value *ChunkSize,
+                              auto &Builder) {
+    createRuntimeFunctionCall(
+        StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
+                     /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
+                     /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
+                     /*pstride=*/PStride, /*incr=*/One,
+                     /*chunk=*/ChunkSize});
+  };
+  BuildInitCall(SchedulingType, CastedChunkSize, Builder);
+  if (DistScheduleSchedType != OMPScheduleType::None &&
+      SchedType != OMPScheduleType::OrderedDistributeChunked &&
+      SchedType != OMPScheduleType::OrderedDistribute) {
+    // We want to emit a second init function call for the dist_schedule
+    // clause on the Distribute construct. However, this should only be done
+    // if a workshare loop is nested within a distribute construct.
+    BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
+  }
 
   // Load values written by the "init" function.
   Value *FirstChunkStart =
@@ -5302,31 +5385,47 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
     bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
     bool HasSimdModifier, bool HasMonotonicModifier,
     bool HasNonmonotonicModifier, bool HasOrderedClause,
-    WorksharingLoopType LoopType, bool NoLoop) {
+    WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
+    Value *DistScheduleChunkSize) {
   if (Config.isTargetDevice())
     return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
   OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
       SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
-      HasNonmonotonicModifier, HasOrderedClause);
+      HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
 
   bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                    OMPScheduleType::ModifierOrdered;
 
+  OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
+  if (HasDistSchedule) {
+    DistScheduleSchedType = DistScheduleChunkSize
+                                ? OMPScheduleType::OrderedDistributeChunked
+                                : OMPScheduleType::OrderedDistribute;
+  }
+
   switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
   case OMPScheduleType::BaseStatic:
-    assert(!ChunkSize && "No chunk size with static-chunked schedule");
-    if (IsOrdered)
+  case OMPScheduleType::BaseDistribute:
+    assert((!ChunkSize || !DistScheduleChunkSize) &&
+           "No chunk size with static-chunked schedule");
+    if (IsOrdered && !HasDistSchedule)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                        NeedsBarrier, ChunkSize);
     // FIXME: Monotonicity ignored?
-    return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
+    if (DistScheduleChunkSize)
+      return applyStaticChunkedWorkshareLoop(
+          DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
+          DistScheduleChunkSize, DistScheduleSchedType);
+    return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
+                                    HasDistSchedule);
 
   case OMPScheduleType::BaseStaticChunked:
-    if (IsOrdered)
+  case OMPScheduleType::BaseDistributeChunked:
+    if (IsOrdered && !HasDistSchedule)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                        NeedsBarrier, ChunkSize);
     // FIXME: Monotonicity ignored?
-    return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
-                                           ChunkSize);
+    return applyStaticChunkedWorkshareLoop(
+        DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
+        DistScheduleChunkSize, DistScheduleSchedType);
 
   case OMPScheduleType::BaseRuntime:
   case OMPScheduleType::BaseAuto:
@@ -5920,8 +6019,8 @@ static void addLoopMetadata(CanonicalLoopInfo *Loop,
 }
 
 /// Attach llvm.access.group metadata to the memref instructions of \p Block
-static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
-                            LoopInfo &LI) {
+static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
+                                   LoopInfo &LI) {
   for (Instruction &I : *Block) {
     if (I.mayReadOrWriteMemory()) {
       // TODO: This instruction may already have access group from
@@ -6111,16 +6210,8 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
   // dependences of 'safelen' iterations are possible.
   // If clause order(concurrent) is specified then the memory instructions
   // are marked parallel even if 'safelen' is finite.
-  if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
-    // Add access group metadata to memory-access instructions.
-    MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
-    for (BasicBlock *BB : Reachable)
-      addSimdMetadata(BB, AccessGroup, LI);
-    // TODO: If the loop has existing parallel access metadata, have
-    // to combine two lists.
-    LoopMDList.push_back(MDNode::get(
-        Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
-  }
+  if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
+    applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
 
   // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
   // versions so we can't add the loop attributes in that case.
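As an aside for reviewers, the sketch below is hypothetical user code (the function, names, and values are not taken from this patch) showing the kind of composite construct the double init-call lowering above targets. The schedule-type constants in the comments are the ones the new tests check: 33/34 for chunked/unchunked static worksharing schedules, and 91/92 for their distribute (dist_schedule) counterparts.

```cpp
// Hypothetical example only. With this patch, the worksharing loop nested in
// the distribute construct emits two __kmpc_for_static_init_4u calls: one
// with schedule type 33 (static chunked) for schedule(static, 64), and a
// second with schedule type 91 (distribute static chunked) for
// dist_schedule(static, 1024).
void saxpy(int n, float a, const float *x, float *y) {
#pragma omp teams distribute parallel for dist_schedule(static, 1024)         \
    schedule(static, 64)
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}
```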
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 1b069c62a8be9..0d6b2870c625a 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -617,6 +617,7 @@ parseScheduleClause(OpAsmParser &parser, ClauseScheduleKindAttr &scheduleAttr,
     break;
   case ClauseScheduleKind::Auto:
   case ClauseScheduleKind::Runtime:
+  case ClauseScheduleKind::Distribute:
     chunkSize = std::nullopt;
   }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 0378033b55b38..0d5b553c8e652 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -61,6 +61,8 @@ convertToScheduleKind(std::optional<omp::ClauseScheduleKind> schedKind) {
     return llvm::omp::OMP_SCHEDULE_Auto;
   case omp::ClauseScheduleKind::Runtime:
     return llvm::omp::OMP_SCHEDULE_Runtime;
+  case omp::ClauseScheduleKind::Distribute:
+    return llvm::omp::OMP_SCHEDULE_Distribute;
   }
   llvm_unreachable("unhandled schedule clause argument");
 }
@@ -319,10 +321,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
     if (op.getDevice())
       result = todo("device");
   };
-  auto checkDistSchedule = [&todo](auto op, LogicalResult &result) {
-    if (op.getDistScheduleChunkSize())
-      result = todo("dist_schedule with chunk_size");
-  };
   auto checkHint = [](auto op, LogicalResult &) {
     if (op.getHint())
       op.emitWarning("hint clause discarded");
@@ -387,7 +385,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
       })
       .Case([&](omp::DistributeOp op) {
         checkAllocate(op, result);
-        checkDistSchedule(op, result);
         checkOrder(op, result);
       })
      .Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
@@ -2548,6 +2545,19 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
     chunk = builder.CreateSExtOrTrunc(chunkVar, ivType);
   }
 
+  omp::DistributeOp distributeOp = nullptr;
+  llvm::Value *distScheduleChunk = nullptr;
+  bool hasDistSchedule = false;
+  if (llvm::isa_and_present<omp::DistributeOp>(opInst.getParentOp())) {
+    distributeOp = cast<omp::DistributeOp>(opInst.getParentOp());
+    hasDistSchedule = distributeOp.getDistScheduleStatic();
+    if (distributeOp.getDistScheduleChunkSize()) {
+      llvm::Value *chunkVar = moduleTranslation.lookupValue(
+          distributeOp.getDistScheduleChunkSize());
+      distScheduleChunk = builder.CreateSExtOrTrunc(chunkVar, ivType);
+    }
+  }
+
   PrivateVarsInfo privateVarsInfo(wsloopOp);
 
   SmallVector<omp::DeclareReductionOp> reductionDecls;
@@ -2675,7 +2685,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
       convertToScheduleKind(schedule), chunk, isSimd,
       scheduleMod == omp::ScheduleModifier::monotonic,
       scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
-      workshareLoopType, noLoopMode);
+      workshareLoopType, noLoopMode, hasDistSchedule, distScheduleChunk);
 
   if (failed(handleError(wsloopIP, opInst)))
     return failure();
@@ -5254,15 +5264,18 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
   if (!isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper())) {
     // TODO: Add support for clauses which are valid for DISTRIBUTE
     // constructs. Static schedule is the default.
-    auto schedule = omp::ClauseScheduleKind::Static;
-    bool isOrdered = false;
+    bool hasDistSchedule = distributeOp.getDistScheduleStatic();
+    auto schedule = hasDistSchedule ? omp::ClauseScheduleKind::Distribute
+                                    : omp::ClauseScheduleKind::Static;
+    // dist_schedule clauses are ordered; otherwise this should be false.
+    bool isOrdered = hasDistSchedule;
     std::optional<omp::ScheduleModifier> scheduleMod;
     bool isSimd = false;
     llvm::omp::WorksharingLoopType workshareLoopType =
         llvm::omp::WorksharingLoopType::DistributeStaticLoop;
     bool loopNeedsBarrier = false;
-    llvm::Value *chunk = nullptr;
-
+    llvm::Value *chunk = moduleTranslation.lookupValue(
+        distributeOp.getDistScheduleChunkSize());
     llvm::CanonicalLoopInfo *loopInfo = findCurrentLoopInfo(moduleTranslation);
     llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
@@ -5271,12 +5284,11 @@
             convertToScheduleKind(schedule), chunk, isSimd,
             scheduleMod == omp::ScheduleModifier::monotonic,
             scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
-            workshareLoopType);
+            workshareLoopType, false, hasDistSchedule, chunk);
 
     if (!wsloopIP)
       return wsloopIP.takeError();
   }
-
   if (failed(cleanupPrivateVars(builder, moduleTranslation,
                                 distributeOp.getLoc(), privVarsInfo.llvmVars,
                                 privVarsInfo.privatizers)))
diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
index f2fbe91a41ecd..b122f425f0752 100644
--- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
+++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
@@ -615,3 +615,22 @@ omp.declare_mapper @my_mapper : !llvm.struct<"_QFdeclare_mapperTmy_type", (i32)>
   // CHECK: omp.declare_mapper.info map_entries(%{{.*}}, %{{.*}} : !llvm.ptr, !llvm.ptr)
   omp.declare_mapper.info map_entries(%3, %2 : !llvm.ptr, !llvm.ptr)
 }
+
+// CHECK-LABEL: llvm.func @omp_dist_schedule(%arg0: i32) {
+func.func @omp_dist_schedule(%arg0: i32) {
+  %c1_i32 = arith.constant 1 : i32
+  // CHECK: %1 = llvm.mlir.constant(1024 : i32) : i32
+  %c1024_i32 = arith.constant 1024 : i32
+  %c16_i32 = arith.constant 16 : i32
+  %c8_i32 = arith.constant 8 : i32
+  omp.teams num_teams( to %c8_i32 : i32) thread_limit(%c16_i32 : i32) {
+    // CHECK: omp.distribute dist_schedule_static dist_schedule_chunk_size(%1 : i32) {
+    omp.distribute dist_schedule_static dist_schedule_chunk_size(%c1024_i32 : i32) {
+      omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%arg0) inclusive step (%c1_i32) {
+        omp.terminator
+      }
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/mlir/test/Target/LLVMIR/openmp-dist_schedule.mlir b/mlir/test/Target/LLVMIR/openmp-dist_schedule.mlir
new file mode 100644
index 0000000000000..a0dd5560a15c4
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-dist_schedule.mlir
@@ -0,0 +1,34 @@
+// Test that dist_schedule is translated with the correct schedule type and chunk size where appropriate.
+
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+llvm.func @distribute_dist_schedule_chunk_size(%lb : i32, %ub : i32, %step : i32, %x : i32) {
+  // CHECK: call void @[[RUNTIME_FUNC:__kmpc_for_static_init_4u]](ptr @1, i32 %omp_global_thread_num, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 1024)
+  // We want to make sure that the next call is not another init call.
+  // CHECK-NOT: call void @[[RUNTIME_FUNC]]
+  %1 = llvm.mlir.constant(1024 : i32) : i32
+  omp.distribute dist_schedule_static dist_schedule_chunk_size(%1 : i32) {
+    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+// When a chunk size is present, we need to make sure the correct parallel accesses metadata is added
+// CHECK: !2 = !{!"llvm.loop.parallel_accesses", !3}
+// CHECK-NEXT: !3 = distinct !{}
+
+// -----
+
+llvm.func @distribute_dist_schedule(%lb : i32, %ub : i32, %step : i32, %x : i32) {
+  // CHECK: call void @[[RUNTIME_FUNC:__kmpc_for_static_init_4u]](ptr @1, i32 %omp_global_thread_num, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 0)
+  // We want to make sure that the next call is not another init call.
+  // CHECK-NOT: call void @[[RUNTIME_FUNC]]
+  omp.distribute dist_schedule_static {
+    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      omp.yield
+    }
+  }
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/openmp-dist_schedule_with_wsloop.mlir b/mlir/test/Target/LLVMIR/openmp-dist_schedule_with_wsloop.mlir
new file mode 100644
index 0000000000000..dad32b48e5419
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-dist_schedule_with_wsloop.mlir
@@ -0,0 +1,205 @@
+// Test that dist_schedule is translated with the correct schedule type and chunk size where appropriate when combined with workshare loops.
+
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+llvm.func @distribute_wsloop_dist_schedule_chunked_schedule_chunked(%n: i32, %teams: i32, %threads: i32, %dcs: i32) {
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  %1 = llvm.mlir.constant(1 : i32) : i32
+  %scs = llvm.mlir.constant(64 : i32) : i32
+
+  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
+    omp.parallel {
+      omp.distribute dist_schedule_static dist_schedule_chunk_size(%dcs : i32) {
+        omp.wsloop schedule(static = %scs : i32) {
+          omp.loop_nest (%i) : i32 = (%0) to (%n) step (%1) {
+            omp.yield
+          }
+        } {omp.composite}
+      } {omp.composite}
+      omp.terminator
+    } {omp.composite}
+    omp.terminator
+  }
+  llvm.return
+}
+// CHECK: define internal void @distribute_wsloop_dist_schedule_chunked_schedule_chunked..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 33, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 64)
+// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 %3)
+
+llvm.func @distribute_wsloop_dist_schedule_chunked_schedule_chunked_i64(%n: i32, %teams: i32, %threads: i32) {
+  %0 = llvm.mlir.constant(0 : i64) : i64
+  %1 = llvm.mlir.constant(1 : i64) : i64
+  %dcs = llvm.mlir.constant(1024 : i64) : i64
+  %scs = llvm.mlir.constant(64 : i64) : i64
+  %n64 = llvm.zext %n : i32 to i64
+
+  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
+    omp.parallel {
+      omp.distribute dist_schedule_static dist_schedule_chunk_size(%dcs : i64) {
+        omp.wsloop schedule(static = %scs : i64) {
+          omp.loop_nest (%i) : i64 = (%0) to (%n64) step (%1) {
+            omp.yield
+          }
+        } {omp.composite}
+      } {omp.composite}
+      omp.terminator
+    } {omp.composite}
+    omp.terminator
+  }
+  llvm.return
+}
+// CHECK: define internal void @distribute_wsloop_dist_schedule_chunked_schedule_chunked_i64..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 33, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 64)
+// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 1024)
+
+// -----
+
+llvm.func @distribute_wsloop_dist_schedule_chunked(%n: i32, %teams: i32, %threads: i32) {
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  %1 = llvm.mlir.constant(1 : i32) : i32
+  %dcs = llvm.mlir.constant(1024 : i32) : i32
+
+  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
+    omp.parallel {
+      omp.distribute dist_schedule_static dist_schedule_chunk_size(%dcs : i32) {
+        omp.wsloop schedule(static) {
+          omp.loop_nest (%i) : i32 = (%0) to (%n) step (%1) {
+            omp.yield
+          }
+        } {omp.composite}
+      } {omp.composite}
+      omp.terminator
+    } {omp.composite}
+    omp.terminator
+  }
+  llvm.return
+}
+// CHECK: define internal void @distribute_wsloop_dist_schedule_chunked..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 0)
+// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 1024)
+
+llvm.func @distribute_wsloop_dist_schedule_chunked_i64(%n: i32, %teams: i32, %threads: i32) {
+  %0 = llvm.mlir.constant(0 : i64) : i64
+  %1 = llvm.mlir.constant(1 : i64) : i64
+  %dcs = llvm.mlir.constant(1024 : i64) : i64
+  %n64 = llvm.zext %n : i32 to i64
+
+  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
+    omp.parallel {
+      omp.distribute dist_schedule_static dist_schedule_chunk_size(%dcs : i64) {
+        omp.wsloop schedule(static) {
+          omp.loop_nest (%i) : i64 = (%0) to (%n64) step (%1) {
+            omp.yield
+          }
+        } {omp.composite}
+      } {omp.composite}
+      omp.terminator
+    } {omp.composite}
+    omp.terminator
+  }
+  llvm.return
+}
+// CHECK: define internal void @distribute_wsloop_dist_schedule_chunked_i64..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 0)
+// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 1024)
+
+// -----
+
+llvm.func @distribute_wsloop_schedule_chunked(%n: i32, %teams: i32, %threads: i32) {
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  %1 = llvm.mlir.constant(1 : i32) : i32
+  %scs = llvm.mlir.constant(64 : i32) : i32
+
+  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
+    omp.parallel {
+      omp.distribute dist_schedule_static {
+        omp.wsloop schedule(static = %scs : i32) {
+          omp.loop_nest (%i) : i32 = (%0) to (%n) step (%1) {
+            omp.yield
+          }
+        } {omp.composite}
+      } {omp.composite}
+      omp.terminator
+    } {omp.composite}
+    omp.terminator
+  }
+  llvm.return
+}
+// CHECK: define internal void @distribute_wsloop_schedule_chunked..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 33, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 64)
+// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 0)
+
+llvm.func @distribute_wsloop_schedule_chunked_i64(%n: i32, %teams: i32, %threads: i32) {
+  %0 = llvm.mlir.constant(0 : i64) : i64
+  %1 = llvm.mlir.constant(1 : i64) : i64
+  %scs = llvm.mlir.constant(64 : i64) : i64
+  %n64 = llvm.zext %n : i32 to i64
+
+  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
+    omp.parallel {
+      omp.distribute dist_schedule_static {
+        omp.wsloop schedule(static = %scs : i64) {
+          omp.loop_nest (%i) : i64 = (%0) to (%n64) step (%1) {
+            omp.yield
+          }
+        } {omp.composite}
+      } {omp.composite}
+      omp.terminator
+    } {omp.composite}
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK: define internal void @distribute_wsloop_schedule_chunked_i64..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 33, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 64)
+// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 0)
+
+// -----
+
+llvm.func @distribute_wsloop_no_chunks(%n: i32, %teams: i32, %threads: i32) {
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  %1 = llvm.mlir.constant(1 : i32) : i32
+
+  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
+    omp.parallel {
+      omp.distribute dist_schedule_static {
+        omp.wsloop schedule(static) {
+          omp.loop_nest (%i) : i32 = (%0) to (%n) step (%1) {
+            omp.yield
+          }
+        } {omp.composite}
+      } {omp.composite}
+      omp.terminator
+    } {omp.composite}
+    omp.terminator
+  }
+  llvm.return
+}
+// CHECK: define internal void @distribute_wsloop_no_chunks..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+// CHECK: call void @__kmpc_dist_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.distupperbound, ptr %p.stride, i32 1, i32 0)
+// CHECK: call void @__kmpc_dist_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.distupperbound10, ptr %p.stride, i32 1, i32 0)
+
+llvm.func @distribute_wsloop_no_chunks_i64(%n: i32, %teams: i32, %threads: i32) {
+  %0 = llvm.mlir.constant(0 : i64) : i64
+  %1 = llvm.mlir.constant(1 : i64) : i64
+  %n64 = llvm.zext %n : i32 to i64
+
+  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
+    omp.parallel {
+      omp.distribute dist_schedule_static {
+        omp.wsloop schedule(static) {
+          omp.loop_nest (%i) : i64 = (%0) to (%n64) step (%1) {
+            omp.yield
+          }
+        } {omp.composite}
+      } {omp.composite}
+      omp.terminator
+    } {omp.composite}
+    omp.terminator
+  }
+  llvm.return
+}
+// CHECK: define internal void @distribute_wsloop_no_chunks_i64..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+// CHECK: call void @__kmpc_dist_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.distupperbound, ptr %p.stride, i64 1, i64 0)
+// CHECK: call void @__kmpc_dist_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.distupperbound10, ptr %p.stride, i64 1, i64 0)
\ No newline at end of file
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index af6d254cfd3c3..731a6322736d4 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -39,19 +39,6 @@ llvm.func @distribute_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr
 
 // -----
 
-llvm.func @distribute_dist_schedule(%lb : i32, %ub : i32, %step : i32, %x : i32) {
-  // expected-error@below {{not yet implemented: Unhandled clause dist_schedule with chunk_size in omp.distribute operation}}
-  // expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
-  omp.distribute dist_schedule_static dist_schedule_chunk_size(%x : i32) {
-    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
-      omp.yield
-    }
-  }
-  llvm.return
-}
-
-// -----
-
 llvm.func @distribute_order(%lb : i32, %ub : i32, %step : i32) {
   // expected-error@below {{not yet implemented: Unhandled clause order in omp.distribute operation}}
   // expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
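For completeness, here is a hedged sketch of how a frontend could drive the extended `applyWorkshareLoop` entry point directly. The helper name and chunk value are illustrative only, and `OMPBuilder`, `DL`, `CLI`, and `AllocaIP` are assumed to exist in the caller, as they do in `convertOmpDistribute` above. A non-null `DistScheduleChunkSize` selects `OrderedDistributeChunked` (91); passing `nullptr` with `HasDistSchedule` set selects `OrderedDistribute` (92).

```cpp
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

// Hypothetical caller sketch mirroring what convertOmpDistribute now passes
// for `distribute dist_schedule(static, 1024)`. Note that, as in the MLIR
// translation, the chunk is passed both as the schedule chunk and as the
// dist_schedule chunk, and the loop is marked ordered.
static llvm::OpenMPIRBuilder::InsertPointOrErrorTy
lowerDistScheduleSketch(llvm::OpenMPIRBuilder &OMPBuilder, llvm::DebugLoc DL,
                        llvm::CanonicalLoopInfo *CLI,
                        llvm::OpenMPIRBuilder::InsertPointTy AllocaIP) {
  llvm::Value *Chunk = OMPBuilder.Builder.getInt32(1024);
  return OMPBuilder.applyWorkshareLoop(
      DL, CLI, AllocaIP, /*NeedsBarrier=*/false,
      llvm::omp::OMP_SCHEDULE_Distribute, /*ChunkSize=*/Chunk,
      /*HasSimdModifier=*/false, /*HasMonotonicModifier=*/false,
      /*HasNonmonotonicModifier=*/false, /*HasOrderedClause=*/true,
      llvm::omp::WorksharingLoopType::DistributeStaticLoop,
      /*NoLoop=*/false, /*HasDistSchedule=*/true,
      /*DistScheduleChunkSize=*/Chunk);
}
```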