Skip to content

Commit 8ccb8f8

Browse files
committed
[OPENMP][NVPTX]Improve code by using parallel level counter.
Summary: Previously, whenever we needed the active/common parallel level for different purposes with the full runtime, we iterated over all the records to calculate it. Instead, we can use the warp-based parallel level counters that are already used in no-runtime mode. Reviewers: grokos, gtbercea, kkwli0 Subscribers: guansong, jfb, jdoerfert, caomhin, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D61395 llvm-svn: 359822
1 parent 88a0f13 commit 8ccb8f8

File tree

10 files changed

+200
-197
lines changed

10 files changed

+200
-197
lines changed

openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu

Lines changed: 5 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,7 @@ EXTERN void omp_set_num_threads(int num) {
4747
EXTERN int omp_get_num_threads(void) {
4848
bool isSPMDExecutionMode = isSPMDMode();
4949
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
50-
int rc =
51-
GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
50+
int rc = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
5251
PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
5352
return rc;
5453
}
@@ -83,7 +82,7 @@ EXTERN int omp_get_thread_limit(void) {
8382
EXTERN int omp_get_thread_num() {
8483
bool isSPMDExecutionMode = isSPMDMode();
8584
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
86-
int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized());
85+
int rc = GetOmpThreadId(tid, isSPMDExecutionMode);
8786
PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
8887
return rc;
8988
}
@@ -95,18 +94,7 @@ EXTERN int omp_get_num_procs(void) {
9594
}
9695

9796
EXTERN int omp_in_parallel(void) {
98-
int rc = 0;
99-
if (isRuntimeUninitialized()) {
100-
ASSERT0(LT_FUSSY, isSPMDMode(),
101-
"Expected SPMD mode only with uninitialized runtime.");
102-
rc = 1; // SPMD mode is always in parallel.
103-
} else {
104-
omptarget_nvptx_TaskDescr *currTaskDescr =
105-
getMyTopTaskDescriptor(isSPMDMode());
106-
if (currTaskDescr->InParallelRegion()) {
107-
rc = 1;
108-
}
109-
}
97+
int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
11098
PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
11199
return rc;
112100
}
@@ -155,46 +143,13 @@ EXTERN int omp_get_max_active_levels(void) {
155143
}
156144

157145
EXTERN int omp_get_level(void) {
158-
if (isRuntimeUninitialized()) {
159-
ASSERT0(LT_FUSSY, isSPMDMode(),
160-
"Expected SPMD mode only with uninitialized runtime.");
161-
// parallelLevel starts from 0, need to add 1 for correct level.
162-
return parallelLevel[GetWarpId()] + 1;
163-
}
164-
int level = 0;
165-
omptarget_nvptx_TaskDescr *currTaskDescr =
166-
getMyTopTaskDescriptor(isSPMDMode());
167-
ASSERT0(LT_FUSSY, currTaskDescr,
168-
"do not expect fct to be called in a non-active thread");
169-
do {
170-
if (currTaskDescr->IsParallelConstruct()) {
171-
level++;
172-
}
173-
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
174-
} while (currTaskDescr);
146+
int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
175147
PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
176148
return level;
177149
}
178150

179151
EXTERN int omp_get_active_level(void) {
180-
if (isRuntimeUninitialized()) {
181-
ASSERT0(LT_FUSSY, isSPMDMode(),
182-
"Expected SPMD mode only with uninitialized runtime.");
183-
return 1;
184-
}
185-
int level = 0; // no active level parallelism
186-
omptarget_nvptx_TaskDescr *currTaskDescr =
187-
getMyTopTaskDescriptor(isSPMDMode());
188-
ASSERT0(LT_FUSSY, currTaskDescr,
189-
"do not expect fct to be called in a non-active thread");
190-
do {
191-
if (currTaskDescr->ThreadsInTeam() > 1) {
192-
// has a parallel with more than one thread in team
193-
level = 1;
194-
break;
195-
}
196-
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
197-
} while (currTaskDescr);
152+
int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
198153
PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
199154
return level;
200155
}

openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu

Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -95,17 +95,16 @@ public:
9595
INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
9696
int32_t *plastiter, T *plower, T *pupper,
9797
ST *pstride, ST chunk,
98-
bool IsSPMDExecutionMode,
99-
bool IsRuntimeUninitialized) {
98+
bool IsSPMDExecutionMode) {
10099
// When the runtime is uninitialized, we assume that the caller is
101100
// in an L0 parallel region and that all worker threads participate.
102101

103102
int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode);
104103

105104
// Assume we are in teams region or that we use a single block
106105
// per target region
107-
ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(
108-
tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
106+
ST numberOfActiveOMPThreads =
107+
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode);
109108

110109
// All warps that are in excess of the maximum requested, do
111110
// not execute the loop
@@ -456,9 +455,7 @@ public:
456455

457456
// automatically selects thread or warp ID based on selected implementation
458457
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
459-
ASSERT0(LT_FUSSY,
460-
gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
461-
checkRuntimeUninitialized(loc)),
458+
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc)),
462459
"current thread is not needed here; error");
463460
// retrieve schedule
464461
kmp_sched_t schedule =
@@ -509,13 +506,12 @@ public:
509506
*pupper = myUb;
510507
*pstride = 1;
511508

512-
PRINT(
513-
LD_LOOP,
514-
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
515-
"last %d\n",
516-
(int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
517-
(int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper,
518-
(long long)*pstride, (int)*plast);
509+
PRINT(LD_LOOP,
510+
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
511+
"last %d\n",
512+
(int)GetNumberOfOmpThreads(tid, isSPMDMode()),
513+
(int)GetNumberOfWorkersInTeam(), (long long)*plower,
514+
(long long)*pupper, (long long)*pstride, (int)*plast);
519515
return DISPATCH_NOTFINISHED;
520516
}
521517

@@ -629,7 +625,7 @@ EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
629625
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
630626
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
631627
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
632-
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
628+
checkSPMDMode(loc));
633629
}
634630

635631
EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
@@ -640,7 +636,7 @@ EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
640636
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
641637
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
642638
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
643-
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
639+
checkSPMDMode(loc));
644640
}
645641

646642
EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
@@ -651,7 +647,7 @@ EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
651647
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
652648
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
653649
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
654-
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
650+
checkSPMDMode(loc));
655651
}
656652

657653
EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
@@ -662,7 +658,7 @@ EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
662658
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
663659
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
664660
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
665-
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
661+
checkSPMDMode(loc));
666662
}
667663

668664
EXTERN
@@ -674,7 +670,7 @@ void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
674670
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
675671
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
676672
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
677-
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
673+
/*IsSPMDExecutionMode=*/true);
678674
}
679675

680676
EXTERN
@@ -686,7 +682,7 @@ void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
686682
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
687683
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
688684
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
689-
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
685+
/*IsSPMDExecutionMode=*/true);
690686
}
691687

692688
EXTERN
@@ -698,7 +694,7 @@ void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
698694
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
699695
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
700696
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
701-
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
697+
/*IsSPMDExecutionMode=*/true);
702698
}
703699

704700
EXTERN
@@ -710,7 +706,7 @@ void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
710706
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
711707
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
712708
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
713-
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
709+
/*IsSPMDExecutionMode=*/true);
714710
}
715711

716712
EXTERN
@@ -721,7 +717,7 @@ void __kmpc_for_static_init_4_simple_generic(
721717
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
722718
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
723719
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
724-
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
720+
/*IsSPMDExecutionMode=*/false);
725721
}
726722

727723
EXTERN
@@ -732,7 +728,7 @@ void __kmpc_for_static_init_4u_simple_generic(
732728
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
733729
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
734730
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
735-
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
731+
/*IsSPMDExecutionMode=*/false);
736732
}
737733

738734
EXTERN
@@ -743,7 +739,7 @@ void __kmpc_for_static_init_8_simple_generic(
743739
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
744740
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
745741
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
746-
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
742+
/*IsSPMDExecutionMode=*/false);
747743
}
748744

749745
EXTERN
@@ -754,7 +750,7 @@ void __kmpc_for_static_init_8u_simple_generic(
754750
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
755751
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
756752
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
757-
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
753+
/*IsSPMDExecutionMode=*/false);
758754
}
759755

760756
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
@@ -787,8 +783,7 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
787783

788784
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
789785
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
790-
uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
791-
checkRuntimeUninitialized(loc));
786+
uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc));
792787
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
793788
for (unsigned i = 0; i < varNum; i++) {
794789
// Reset buffer.

openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
4343
ASSERT0(LT_FUSSY, RequiresOMPRuntime,
4444
"Generic always requires initialized runtime.");
4545
setExecutionParameters(Generic, RuntimeInitialized);
46+
for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I)
47+
parallelLevel[I] = 0;
4648

4749
int threadIdInBlock = GetThreadIdInBlock();
4850
ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
@@ -91,32 +93,32 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
9193
int16_t RequiresDataSharing) {
9294
PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
9395

96+
setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
97+
: RuntimeUninitialized);
98+
int threadId = GetThreadIdInBlock();
99+
if (threadId == 0) {
100+
usedSlotIdx = smid() % MAX_SM;
101+
parallelLevel[0] =
102+
1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
103+
} else if (GetLaneId() == 0) {
104+
parallelLevel[GetWarpId()] =
105+
1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
106+
}
94107
if (!RequiresOMPRuntime) {
95-
// If OMP runtime is not required don't initialize OMP state.
96-
setExecutionParameters(Spmd, RuntimeUninitialized);
97-
if (GetThreadIdInBlock() == 0) {
98-
usedSlotIdx = smid() % MAX_SM;
99-
parallelLevel[0] = 0;
100-
} else if (GetLaneId() == 0) {
101-
parallelLevel[GetWarpId()] = 0;
102-
}
108+
// Runtime is not required - exit.
103109
__SYNCTHREADS();
104110
return;
105111
}
106-
setExecutionParameters(Spmd, RuntimeInitialized);
107112

108113
//
109114
// Team Context Initialization.
110115
//
111116
// In SPMD mode there is no master thread so use any cuda thread for team
112117
// context initialization.
113-
int threadId = GetThreadIdInBlock();
114118
if (threadId == 0) {
115119
// Get a state object from the queue.
116-
int slot = smid() % MAX_SM;
117-
usedSlotIdx = slot;
118120
omptarget_nvptx_threadPrivateContext =
119-
omptarget_nvptx_device_State[slot].Dequeue();
121+
omptarget_nvptx_device_State[usedSlotIdx].Dequeue();
120122

121123
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
122124
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
@@ -148,7 +150,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
148150
"%d threads\n",
149151
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam());
150152

151-
if (RequiresDataSharing && threadId % WARPSIZE == 0) {
153+
if (RequiresDataSharing && GetLaneId() == 0) {
152154
// Warp master initializes data sharing environment.
153155
unsigned WID = threadId / WARPSIZE;
154156
__kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(

openmp/libomptarget/deviceRTLs/nvptx/src/option.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
#define MAX_SM 16
4545
#endif
4646

47+
#define OMP_ACTIVE_PARALLEL_LEVEL 128
48+
4749
////////////////////////////////////////////////////////////////////////////////
4850
// algo options
4951
////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)