1+ // MIT License
2+ //
3+ // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
4+ //
5+ // Permission is hereby granted, free of charge, to any person obtaining a copy
6+ // of this software and associated documentation files (the "Software"), to deal
7+ // in the Software without restriction, including without limitation the rights
8+ // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+ // copies of the Software, and to permit persons to whom the Software is
10+ // furnished to do so, subject to the following conditions:
11+ //
12+ // The above copyright notice and this permission notice shall be included in all
13+ // copies or substantial portions of the Software.
14+ //
15+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+ // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+ // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+ // SOFTWARE.
22+
#include <hip/hip_runtime.h>

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <random>
#include <type_traits>
#include <vector>
28+
// Checks the result of a HIP runtime call and, on failure, reports the
// error code, its human-readable description, and the call site.
// Notes:
//  - No space is allowed between the macro name and '(' — otherwise this
//    becomes an object-like macro and every use fails to compile.
//  - Wrapped in do { } while (0) so it acts as a single statement and is
//    safe inside unbraced if/else.
//  - Exits with failure so later HIP calls don't run on a broken context.
#define HIP_CHECK(expression)                          \
    do {                                               \
        const hipError_t status = (expression);        \
        if (status != hipSuccess) {                    \
            std::cerr << "HIP error "                  \
                      << status << ": "                \
                      << hipGetErrorString(status)     \
                      << " at " << __FILE__ << ":"     \
                      << __LINE__ << std::endl;        \
            std::exit(EXIT_FAILURE);                   \
        }                                              \
    } while (0)
40+
41+ // [Sphinx template warp size block reduction kernel start]
/// Integer type holding one lane-mask bit per lane of a warp:
/// 32-bit for wave32 devices, 64-bit for wave64 devices.
template<uint32_t WarpSize>
using lane_mask_t = std::conditional_t<WarpSize == 32, uint32_t, uint64_t>;
44+
// Block-wide sum reduction of masked inputs.
// Each thread loads one int — zeroed when its global index is out of bounds
// or when its lane bit is cleared in the warp's mask word — the block then
// reduces in shared memory down to WarpSize partial sums, and the final
// warp-level reduction is done with shuffles. Thread 0 writes one sum per
// block to output[blockIdx.x].
//
// WarpSize must equal the device warp size (32 or 64) so that lane_mask_t
// is the right width and the shuffle loop below unrolls fully.
// Assumes blockDim.x is a power of two and a multiple of WarpSize
// (the halving loops only cover every element under that assumption).
//
// input:  element buffer (size elements are valid)
// mask:   one lane-mask word per warp per block, laid out block-major
// output: one int per block
// size:   number of valid elements in input
template <uint32_t WarpSize>
__global__ void block_reduce (int * input, lane_mask_t <WarpSize>* mask, int * output, size_t size) {
// Dynamic shared memory: the launch supplies blockDim.x * sizeof(int).
extern __shared__ int shared[];

// Bounds-checked, mask-gated read: contributes input[i] only when i is in
// range AND this lane's bit is set in the warp's mask word, else 0.
auto read_global_safe = [&](const uint32_t i, const uint32_t lane_id, const uint32_t mask_id)
{
lane_mask_t <WarpSize> warp_mask = lane_mask_t <WarpSize>(1 ) << lane_id;
return (i < size) && (mask[mask_id] & warp_mask) ? input[i] : 0 ;
};

// tid: thread index within the block; lid: lane within the warp;
// wid: warp index within the block; bid: block index; gid: global index.
const uint32_t tid = threadIdx.x ,
lid = threadIdx.x % WarpSize,
wid = threadIdx.x / WarpSize,
bid = blockIdx.x ,
gid = bid * blockDim.x + tid;

// Stage one (masked) element per thread into shared memory.
// Mask words are indexed block-major: block * warps_per_block + warp.
shared[tid] = read_global_safe (gid, lid, bid * (blockDim.x / WarpSize) + wid);
__syncthreads ();

// Tree reduction in shared memory: halve the active range each step,
// stopping once WarpSize partial sums remain in shared[0..WarpSize-1].
// The barrier each iteration orders the reads against the prior writes.
for (uint32_t i = blockDim.x / 2 ; i >= WarpSize; i /= 2 )
{
if (tid < i)
shared[tid] = shared[tid] + shared[tid + i];
__syncthreads ();
}

// Move the partial into a register before the shuffle stage; the barrier
// ensures all shared-memory writes above are visible first.
int result = shared[tid];
__syncthreads ();

// Warp-level reduction via shuffles. Because WarpSize is a compile-time
// constant this loop fully unrolls, which a runtime warpSize would prevent.
#pragma unroll
for (uint32_t i = WarpSize/2 ; i >= 1 ; i /= 2 ) {
result = result + __shfl_down (result, i);
}

// Only lane 0 of warp 0 (tid == 0) holds the complete block sum;
// other warps' shuffle chains produce unused values.
if (tid == 0 )
output[bid] = result;
};
88+ // [Sphinx template warp size block reduction kernel end]
89+
90+ // [Sphinx template warp size mask generation start]
91+ template <uint32_t WarpSize>
92+ void generate_and_copy_mask (
93+ void *d_mask,
94+ std::vector<int >& vectorExpected,
95+ int numOfBlocks,
96+ int numberOfWarp,
97+ int mask_size,
98+ int mask_element_size) {
99+
100+ std::random_device rd;
101+ std::mt19937_64 eng (rd ());
102+
103+ // Host side mask vector
104+ std::vector<lane_mask_t <WarpSize>> mask (mask_size);
105+ // Define uniform unsigned int distribution
106+ std::uniform_int_distribution<lane_mask_t <WarpSize>> distr;
107+ // Fill up the mask
108+ for (int i=0 ; i < numOfBlocks; i++) {
109+ int count = 0 ;
110+ for (int j=0 ; j < numberOfWarp; j++) {
111+ int mask_index = i * numberOfWarp + j;
112+ mask[mask_index] = distr (eng);
113+ if constexpr (WarpSize == 32 )
114+ count += __builtin_popcount (mask[mask_index]);
115+ else
116+ count += __builtin_popcountll (mask[mask_index]);
117+ }
118+ vectorExpected[i]= count;
119+ }
120+
121+ // Copy the mask array
122+ HIP_CHECK (hipMemcpy (d_mask, mask.data (), mask_size * mask_element_size, hipMemcpyHostToDevice));
123+ }
124+ // [Sphinx template warp size mask generation end]
125+
126+ int main () {
127+
128+ int deviceId = 0 ;
129+ int warpSizeHost;
130+ HIP_CHECK (hipDeviceGetAttribute (&warpSizeHost, hipDeviceAttributeWarpSize, deviceId));
131+ std::cout << " Warp size: " << warpSizeHost << std::endl;
132+
133+ constexpr int numOfBlocks = 16 ;
134+ constexpr int threadsPerBlock = 1024 ;
135+ const int numberOfWarp = threadsPerBlock / warpSizeHost;
136+ const int mask_element_size = warpSizeHost == 32 ? sizeof (uint32_t ) : sizeof (uint64_t );
137+ const int mask_size = numOfBlocks * numberOfWarp;
138+ constexpr size_t arraySize = numOfBlocks * threadsPerBlock;
139+
140+ int *d_data, *d_results;
141+ void *d_mask;
142+ int initValue = 1 ;
143+ std::vector<int > vectorInput (arraySize, initValue);
144+ std::vector<int > vectorOutput (numOfBlocks);
145+ std::vector<int > vectorExpected (numOfBlocks);
146+ // Allocate device memory
147+ HIP_CHECK (hipMalloc (&d_data, arraySize * sizeof (*d_data)));
148+ HIP_CHECK (hipMalloc (&d_mask, mask_size * mask_element_size));
149+ HIP_CHECK (hipMalloc (&d_results, numOfBlocks * sizeof (*d_results)));
150+ // Host to Device copy of the input array
151+ HIP_CHECK (hipMemcpy (d_data, vectorInput.data (), arraySize * sizeof (*d_data), hipMemcpyHostToDevice));
152+
153+ // [Sphinx template warp size select kernel start]
154+ // Fill up the mask variable, copy to device and select the right kernel.
155+ if (warpSizeHost == 32 ) {
156+ // Generate and copy mask arrays
157+ generate_and_copy_mask<32 >(d_mask, vectorExpected, numOfBlocks, numberOfWarp, mask_size, mask_element_size);
158+
159+ // Start the kernel
160+ block_reduce<32 ><<<dim3 (numOfBlocks), dim3 (threadsPerBlock), threadsPerBlock * sizeof (*d_data)>>>(
161+ d_data,
162+ static_cast <uint32_t *>(d_mask),
163+ d_results,
164+ arraySize);
165+ } else if (warpSizeHost == 64 ) {
166+ // Generate and copy mask arrays
167+ generate_and_copy_mask<64 >(d_mask, vectorExpected, numOfBlocks, numberOfWarp, mask_size, mask_element_size);
168+
169+ // Start the kernel
170+ block_reduce<64 ><<<dim3 (numOfBlocks), dim3 (threadsPerBlock), threadsPerBlock * sizeof (*d_data)>>>(
171+ d_data,
172+ static_cast <uint64_t *>(d_mask),
173+ d_results,
174+ arraySize);
175+ } else {
176+ std::cerr << " Unsupported warp size." << std::endl;
177+ return 0 ;
178+ }
179+ // [Sphinx template warp size select kernel end]
180+
181+ // Check the kernel launch
182+ HIP_CHECK (hipGetLastError ());
183+ // Check for kernel execution error
184+ HIP_CHECK (hipDeviceSynchronize ());
185+ // Device to Host copy of the result
186+ HIP_CHECK (hipMemcpy (vectorOutput.data (), d_results, numOfBlocks * sizeof (*d_results), hipMemcpyDeviceToHost));
187+
188+ // Verify results
189+ bool passed = true ;
190+ for (size_t i = 0 ; i < numOfBlocks; ++i) {
191+ if (vectorOutput[i] != vectorExpected[i]) {
192+ passed = false ;
193+ std::cerr << " Validation failed! Expected " << vectorExpected[i] << " got " << vectorOutput[i] << " at index: " << i << std::endl;
194+ }
195+ }
196+ if (passed){
197+ std::cout << " Execution completed successfully." << std::endl;
198+ }else {
199+ std::cerr << " Execution failed." << std::endl;
200+ }
201+
202+ // Cleanup
203+ HIP_CHECK (hipFree (d_data));
204+ HIP_CHECK (hipFree (d_mask));
205+ HIP_CHECK (hipFree (d_results));
206+ return 0 ;
207+ }