@@ -44,45 +44,119 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
+                                          ShuffleReductFnTy shflFct) {
+  uint32_t size, remote_id, physical_lane_id;
+  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
+  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
+  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
+  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
+  do {
+    Liveness = mapping::activemask();
+    remote_id = utils::ffs(Liveness & lanemask_gt);
+    size = utils::popc(Liveness);
+    logical_lane_id /= 2;
+    shflFct(reduce_data, /*LaneId=*/logical_lane_id,
+            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
+  } while (logical_lane_id % 2 == 0 && size > 1);
+  return (logical_lane_id == 0);
+}
+#endif
+
+static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
+                                            uint64_t reduce_size,
+                                            void *reduce_data,
                                             ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct) {
+                                            InterWarpCopyFnTy cpyFct,
+                                            bool isSPMDExecutionMode, bool) {
+  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
+    BlockThreadId = 0;
   uint32_t NumThreads = omp_get_num_threads();
-  // Handle degenerated parallel regions, including all nested ones, first.
   if (NumThreads == 1)
     return 1;
-
-  /*
-   * 1. Reduce within a warp.
-   * 2. Warp master copies value to warp 0 via shared memory.
-   * 3. Warp 0 reduces to a single value.
-   * 4. The reduced value is available in the thread that returns 1.
-   */
-
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  uint32_t NumWarps =
+  /*
+   * This reduce function handles reduction within a team. It handles
+   * parallel regions in both L1 and L2 parallelism levels. It also
+   * supports Generic, SPMD, and NoOMP modes.
+   *
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  uint32_t WarpsNeeded =
       (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+  uint32_t WarpId = mapping::getWarpIdInBlock();
 
+  // Volta execution model:
   // For the Generic execution mode a parallel region either has 1 thread and
   // beyond that, always a multiple of 32. For the SPMD execution mode we may
   // have any number of threads.
-  gpu_regular_warp_reduce(reduce_data, shflFct);
+  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+                              /*LaneId=*/mapping::getThreadIdInBlock() %
+                                  mapping::getWarpSize());
 
   // When we have more than [mapping::getWarpSize()] number of threads
   // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
   if (NumThreads > mapping::getWarpSize()) {
     // Gather all the reduced values from each warp
     // to the first warp.
-    cpyFct(reduce_data, NumWarps);
+    cpyFct(reduce_data, WarpsNeeded);
 
-    if (BlockThreadId < mapping::getWarpSize())
-      gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
   }
-
-  // In Generic and in SPMD mode block thread Id 0 is what we want.
-  // It's either the main thread in SPMD mode or the "acting" main thread in the
-  // parallel region.
   return BlockThreadId == 0;
+#else
+  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+  if (Liveness == lanes::All) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/utils::popc(Liveness),
+                              /*LaneId=*/mapping::getThreadIdInBlock() %
+                                  mapping::getWarpSize());
+  else { // Dispersed lanes. Only threads in L2
+         // parallel region may enter here; return
+         // early.
+    return gpu_irregular_simd_reduce(reduce_data, shflFct);
+  }
+
+  // When we have more than [mapping::getWarpSize()] number of threads
+  // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
+  if (NumThreads > mapping::getWarpSize()) {
+    uint32_t WarpsNeeded =
+        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
+
+    return BlockThreadId == 0;
+  }
+
+  // Get the OMP thread Id. This is different from BlockThreadId in the case of
+  // an L2 parallel region.
+  return TId == 0;
+#endif // __CUDA_ARCH__ >= 700
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
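Note on two idioms in the pre-Volta (`#else`) path of the hunk above: the check `!(Liveness & (Liveness + 1))` is true only when the set bits of the active-lane mask form a contiguous run starting at lane 0, and `WarpsNeeded` is a plain ceiling division by the warp size. Below is a minimal host-side sketch of both; the helper names `contiguousFromLaneZero` and `warpsNeeded` are illustrative only and not part of the DeviceRTL.

```cpp
#include <cstdint>
#include <cstdio>

// True iff the mask has the form 0...011...1, i.e. the set bits are a
// contiguous run starting at bit 0. Adding 1 to such a mask clears every
// set bit, so the AND is zero; any "hole" leaves a bit standing.
static bool contiguousFromLaneZero(uint32_t liveness) {
  return (liveness & (liveness + 1)) == 0;
}

// Ceiling division used in the patch to size the inter-warp copy.
static uint32_t warpsNeeded(uint32_t numThreads, uint32_t warpSize = 32) {
  return (numThreads + warpSize - 1) / warpSize;
}

int main() {
  printf("%d\n", contiguousFromLaneZero(0x0000000Fu)); // lanes 0-3 live -> 1
  printf("%d\n", contiguousFromLaneZero(0x000000F0u)); // lanes 4-7 live -> 0
  printf("%d\n", contiguousFromLaneZero(0x0000000Du)); // hole at lane 1 -> 0
  printf("%u\n", (unsigned)warpsNeeded(96));           // 96 threads -> 3 warps
  printf("%u\n", (unsigned)warpsNeeded(97));           // 97 threads -> 4 warps
  return 0;
}
```

Lanes 4-7 in the second example are contiguous but do not start at lane 0, so a mask like that still falls through to the dispersed-lanes case, which calls gpu_irregular_simd_reduce.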
@@ -99,7 +173,9 @@ extern "C" {
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
-  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
+  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
+                                      shflFct, cpyFct, mapping::isSPMDMode(),
+                                      false);
 }
 
 /// Mostly like _v2 but with the builtin assumption that we have less than
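The numbered comment in nvptx_parallel_reduce_nowait describes the shape shared by both architecture paths: reduce within each warp, have the warp masters copy their partial results to warp 0, let warp 0 reduce those, and report the value from the single thread that returns 1. The sketch below emulates those phases on the host for a sum over 96 threads; it is a conceptual illustration of that description only and does not use the runtime's shflFct/cpyFct callbacks.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  constexpr uint32_t WarpSize = 32;
  const uint32_t NumThreads = 96;      // three full warps in this example
  std::vector<int> Val(NumThreads, 1); // every thread contributes 1

  // Phase 1: each warp reduces its own 32 values (a tree of shuffles on the
  // device; a plain per-warp accumulation here).
  const uint32_t WarpsNeeded = (NumThreads + WarpSize - 1) / WarpSize;
  std::vector<int> WarpResult(WarpsNeeded, 0);
  for (uint32_t T = 0; T < NumThreads; ++T)
    WarpResult[T / WarpSize] += Val[T];

  // Phase 2: warp masters publish their partial result to warp 0 (the role of
  // cpyFct and shared memory in the runtime).
  // Phase 3: warp 0 reduces the WarpsNeeded partial results to one value.
  int Total = 0;
  for (uint32_t W = 0; W < WarpsNeeded; ++W)
    Total += WarpResult[W];

  // Phase 4: only the thread that "returns 1" reports the reduced value.
  printf("reduced value = %d\n", Total); // prints 96
  return 0;
}
```

Forwarding TId from __kmpc_nvptx_parallel_reduce_nowait_v2 is what lets the pre-Volta path finish with `return TId == 0;` for an L2 parallel region, where the reporting thread's BlockThreadId is generally not 0.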