Skip to content

Commit c05828f

Browse files
authored
[OpenMP] Fix num_iters in __kmpc_*_loop DeviceRTL functions (llvm#1385)
2 parents 04e49d8 + 367f351 commit c05828f

File tree

6 files changed: 10 additions and 23 deletions

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4523,23 +4523,10 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
45234523
static void createTargetLoopWorkshareCall(
45244524
OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
45254525
BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4526-
Type *ParallelTaskPtr, Value *TripCountOrig, Function &LoopBodyFn) {
4526+
Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4527+
Type *TripCountTy = TripCount->getType();
45274528
Module &M = OMPBuilder->M;
45284529
IRBuilder<> &Builder = OMPBuilder->Builder;
4529-
Value *TripCount = TripCountOrig;
4530-
// FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
4531-
// not be the right way to fix it, but this works for now.
4532-
if (OMPBuilder->Config.isGPU()) {
4533-
Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4534-
LLVMContext &Ctx = M.getContext();
4535-
Type *IVTy = TripCountOrig->getType();
4536-
Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32
4537-
? Type::getInt32Ty(Ctx)
4538-
: Type::getInt64Ty(Ctx);
4539-
Constant *One = ConstantInt::get(InternalIVTy, 1);
4540-
TripCount = Builder.CreateSub(TripCountOrig, One, "modified_trip_count");
4541-
}
4542-
Type *TripCountTy = TripCount->getType();
45434530
FunctionCallee RTLFn =
45444531
getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
45454532
SmallVector<Value *, 8> RealArgs;

llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2426,7 +2426,7 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkshareLoopTarget) {
24262426
ConstantInt *WorkshareLoopRuntimeCallTripCount =
24272427
dyn_cast<ConstantInt>(WorkshareLoopRuntimeCall->getArgOperand(3));
24282428
EXPECT_NE(WorkshareLoopRuntimeCallTripCount, nullptr);
2429-
EXPECT_EQ(WorkshareLoopRuntimeCallTripCount->getSExtValue() + 1,
2429+
EXPECT_EQ(WorkshareLoopRuntimeCallTripCount->getSExtValue(),
24302430
TripCountConstInt->getSExtValue());
24312431
}
24322432

mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
3737
// CHECK-SAME: ptr %[[ARG_PTR:.*]])
3838
// CHECK-SAME: #[[ATTRS1:[0-9]+]]
3939
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB]] to ptr),
40-
// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 9,
40+
// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 10,
4141
// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 0)
4242

4343
// CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) #[[ATTRS2:[0-9]+]] {

mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
2626

2727
// CHECK: define void @[[FUNC_COLLAPSED_WSLOOP:.*]](ptr %[[ARG0:.*]])
2828
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr),
29-
// CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 9999,
29+
// CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 10000,
3030
// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i32 0)
3131

3232
// CHECK: define internal void @[[COLLAPSED_WSLOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])

mlir/test/Target/LLVMIR/omptarget-wsloop.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
4545
// CHECK: %[[GEP:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG_ASCAST]], i32 0, i32 0
4646
// CHECK: store ptr %[[LOAD]], ptr %[[GEP]], align 8
4747
// CHECK: %[[NUM_THREADS:.*]] = call i32 @omp_get_num_threads()
48-
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 9, i32 %[[NUM_THREADS]], i32 0)
48+
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0)
4949

5050
// CHECK: define internal void @[[LOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
5151
// CHECK: %[[GEP2:.*]] = getelementptr { ptr }, ptr %[[LOOP_BODY_ARG]], i32 0, i32 0
@@ -54,6 +54,6 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
5454
// CHECK: store i32 %[[VAL0:.*]], ptr %[[GEP3]], align 4
5555

5656
// CHECK: define void @[[FUNC_EMPTY_WSLOOP:.*]]()
57-
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 9, i32 %[[NUM_THREADS:.*]], i32 0)
57+
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
5858

5959
// CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])

offload/DeviceRTL/src/Workshare.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -969,19 +969,19 @@ template <typename Ty> class StaticLoopChunker {
969969
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
970970
TY num_threads, TY block_chunk, TY thread_chunk) { \
971971
ompx::StaticLoopChunker<TY>::DistributeFor( \
972-
loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk); \
972+
loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \
973973
} \
974974
[[gnu::flatten, clang::always_inline]] void \
975975
__kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
976976
void *arg, TY num_iters, \
977977
TY block_chunk) { \
978-
ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1, \
978+
ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters, \
979979
block_chunk); \
980980
} \
981981
[[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
982982
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
983983
TY num_threads, TY thread_chunk) { \
984-
ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
984+
ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
985985
thread_chunk); \
986986
}
987987

0 commit comments

Comments
 (0)