|
| 1 | +// ====------ sync_warp_p2.cu---------- *- CUDA -* ----===//// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +// |
| 8 | +// ===----------------------------------------------------------------------===// |
| 9 | +#include <iostream> |
| 10 | +#include <cuda_runtime.h> |
| 11 | +#include <vector> |
| 12 | +#include <set> |
| 13 | +#define WARP_SIZE 32 |
| 14 | +#define DATA_NUM 128 |
| 15 | + |
| 16 | +template<typename T = int> |
| 17 | +void init_data(T* data, int num) { |
| 18 | + std::vector<T> host_data(num); |
| 19 | + for(int i = 0; i < num; i++) |
| 20 | + host_data[i] = i + 3; |
| 21 | + cudaMemcpy(data, host_data.data(), num * sizeof(T), cudaMemcpyHostToDevice); |
| 22 | +} |
| 23 | + |
| 24 | +template<typename T = int> |
| 25 | +bool verify_data(T* data, T* expect, int num, int step = 1, std::set<int> ignore_index = {}) { |
| 26 | + std::vector<T> host_data(num); |
| 27 | + cudaMemcpy(host_data.data(), data, num * sizeof(T), cudaMemcpyDeviceToHost); |
| 28 | + for(int i = 0; i < num; i = i + step) { |
| 29 | + if(ignore_index.count(i)) |
| 30 | + continue; |
| 31 | + if(host_data[i] != expect[i]) { |
| 32 | + return false; |
| 33 | + } |
| 34 | + } |
| 35 | + return true; |
| 36 | +} |
| 37 | + |
| 38 | +template<typename T = int> |
| 39 | +void print_data(T* data, int num, bool is_host = false) { |
| 40 | + if(is_host) { |
| 41 | + for (int i = 0; i < num; i++) { |
| 42 | + std::cout << data[i] << ", "; |
| 43 | + if((i+1)%16 == 0) |
| 44 | + std::cout << std::endl; |
| 45 | + } |
| 46 | + std::cout << std::endl; |
| 47 | + return; |
| 48 | + } |
| 49 | + std::vector<T> host_data(num); |
| 50 | + cudaMemcpy(host_data.data(), data, num * sizeof(T), cudaMemcpyDeviceToHost); |
| 51 | + for (int i = 0; i < num; i++) { |
| 52 | + std::cout << host_data[i] << ", "; |
| 53 | + if((i+1)%16 == 0) |
| 54 | + std::cout << std::endl; |
| 55 | + } |
| 56 | + std::cout << std::endl; |
| 57 | +} |
| 58 | + |
| 59 | + |
| 60 | +//sync API |
| 61 | +__global__ void ShuffleSyncKernel1(unsigned int* data) { |
| 62 | + int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z; |
| 63 | + int output = 0; |
| 64 | + unsigned int mask = 0xFFFFFFF0; |
| 65 | + output = __shfl_sync(mask, threadid, threadid + 1, 16); |
| 66 | + data[threadid] = output; |
| 67 | +} |
| 68 | +__global__ void ShuffleUpSyncKernel1(unsigned int* data) { |
| 69 | + int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z; |
| 70 | + int output = 0; |
| 71 | + unsigned int mask = 0xFFFFFFF0; |
| 72 | + output = __shfl_up_sync(mask, threadid, 1, 16); |
| 73 | + data[threadid] = output; |
| 74 | +} |
| 75 | +__global__ void ShuffleDownSyncKernel1(unsigned int* data) { |
| 76 | + int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z; |
| 77 | + int output = 0; |
| 78 | + unsigned int mask = 0xFFFFFFF0; |
| 79 | + output = __shfl_down_sync(mask, threadid, 1, 16); |
| 80 | + data[threadid] = output; |
| 81 | +} |
| 82 | +__global__ void ShuffleXorSyncKernel1(unsigned int* data) { |
| 83 | + int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z; |
| 84 | + int output = 0; |
| 85 | + unsigned int mask = 0xFFFFFFF0; |
| 86 | + output = __shfl_xor_sync(mask, threadid, 2, 16); |
| 87 | + data[threadid] = output; |
| 88 | +} |
| 89 | + |
| 90 | +//has branch1 |
| 91 | +__global__ void ShuffleSyncKernel2(unsigned int* data) { |
| 92 | + int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z; |
| 93 | + int output = 0; |
| 94 | + unsigned int mask = 0xFFFFFFF0; |
| 95 | + if(threadid%32 >3) { |
| 96 | + output = __shfl_sync(mask, threadid, threadid + 1, 16); |
| 97 | + } |
| 98 | + data[threadid] = output; |
| 99 | +} |
| 100 | +__global__ void ShuffleUpSyncKernel2(unsigned int* data) { |
| 101 | + int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z; |
| 102 | + int output = 0; |
| 103 | + unsigned int mask = 0xFFFFFFF0; |
| 104 | + if(threadid%32 >3) { |
| 105 | + output = __shfl_up_sync(mask, threadid, 1, 16); |
| 106 | + } |
| 107 | + data[threadid] = output; |
| 108 | +} |
| 109 | +__global__ void ShuffleDownSyncKernel2(unsigned int* data) { |
| 110 | + int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z; |
| 111 | + int output = 0; |
| 112 | + unsigned int mask = 0x0FFFFFFF; |
| 113 | + if(threadid%32 < 28) { |
| 114 | + output = __shfl_down_sync(mask, threadid, 1, 16); |
| 115 | + } |
| 116 | + data[threadid] = output; |
| 117 | +} |
| 118 | +__global__ void ShuffleXorSyncKernel2(unsigned int* data) { |
| 119 | + int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z; |
| 120 | + int output = 0; |
| 121 | + unsigned int mask = 0xFFFFFFF0; |
| 122 | + if(threadid%32 >3) { |
| 123 | + output = __shfl_xor_sync(mask, threadid, 2, 16); |
| 124 | + } |
| 125 | + data[threadid] = output; |
| 126 | +} |
| 127 | + |
| 128 | +// has branch 2 |
| 129 | +__global__ void ShuffleSyncKernel3(unsigned int* data) { |
| 130 | + int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z; |
| 131 | + int output = 0; |
| 132 | + unsigned int mask = 0xFFFFFFF0; |
| 133 | + if(threadid%32 >3) { |
| 134 | + unsigned remote = threadid ? threadid - 1 : 0; |
| 135 | + output = __shfl_sync(mask, threadid, remote , 16); |
| 136 | + } |
| 137 | + data[threadid] = output; |
| 138 | +} |
| 139 | + |
| 140 | +int main() { |
| 141 | + bool Result = true; |
| 142 | + int* dev_data = nullptr; |
| 143 | + unsigned int *dev_data_u = nullptr; |
| 144 | + std::set<int> ignore_index; |
| 145 | + dim3 GridSize; |
| 146 | + dim3 BlockSize; |
| 147 | + cudaMalloc(&dev_data, DATA_NUM * sizeof(int)); |
| 148 | + cudaMalloc(&dev_data_u, DATA_NUM * sizeof(unsigned int)); |
| 149 | + GridSize = {2}; |
| 150 | + BlockSize = {32, 2, 1}; |
| 151 | + // NV hardware result reference |
| 152 | + unsigned int expect1[DATA_NUM] = { |
| 153 | + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 16, |
| 154 | + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 32, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 48, |
| 155 | + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 64, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 80, |
| 156 | + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 96, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 112, |
| 157 | + }; |
| 158 | + init_data<unsigned int>(dev_data_u, DATA_NUM); |
| 159 | + |
| 160 | + ShuffleSyncKernel1<<<GridSize, BlockSize>>>(dev_data_u); |
| 161 | + |
| 162 | + cudaDeviceSynchronize(); |
| 163 | + if(!verify_data<unsigned int>(dev_data_u, expect1, DATA_NUM)) { |
| 164 | + std::cout << "ShuffleSyncKernel1" << " verify failed" << std::endl; |
| 165 | + Result = false; |
| 166 | + std::cout << "expect:" << std::endl; |
| 167 | + print_data<unsigned int>(expect1, DATA_NUM, true); |
| 168 | + std::cout << "current result:" << std::endl; |
| 169 | + print_data<unsigned int>(dev_data_u, DATA_NUM); |
| 170 | + } |
| 171 | + |
| 172 | + GridSize = {2}; |
| 173 | + BlockSize = {32, 2, 1}; |
| 174 | + // NV hardware result reference |
| 175 | + unsigned int expect2[DATA_NUM] = { |
| 176 | + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, |
| 177 | + 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, |
| 178 | + 64, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, |
| 179 | + 96, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, |
| 180 | + }; |
| 181 | + init_data<unsigned int>(dev_data_u, DATA_NUM); |
| 182 | + |
| 183 | + ShuffleUpSyncKernel1<<<GridSize, BlockSize>>>(dev_data_u); |
| 184 | + |
| 185 | + cudaDeviceSynchronize(); |
| 186 | + if(!verify_data<unsigned int>(dev_data_u, expect2, DATA_NUM)) { |
| 187 | + std::cout << "ShuffleUpSyncKernel1" << " verify failed" << std::endl; |
| 188 | + Result = false; |
| 189 | + std::cout << "expect:" << std::endl; |
| 190 | + print_data<unsigned int>(expect2, DATA_NUM, true); |
| 191 | + std::cout << "current result:" << std::endl; |
| 192 | + print_data<unsigned int>(dev_data_u, DATA_NUM); |
| 193 | + } |
| 194 | + |
| 195 | + |
| 196 | + GridSize = {2}; |
| 197 | + BlockSize = {32, 2, 1}; |
| 198 | + // NV hardware result reference |
| 199 | + unsigned int expect3[DATA_NUM] = { |
| 200 | + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, |
| 201 | + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 63, |
| 202 | + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 79, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 95, |
| 203 | + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, |
| 204 | + }; |
| 205 | + init_data<unsigned int>(dev_data_u, DATA_NUM); |
| 206 | + |
| 207 | + ShuffleDownSyncKernel1<<<GridSize, BlockSize>>>(dev_data_u); |
| 208 | + |
| 209 | + cudaDeviceSynchronize(); |
| 210 | + if(!verify_data<unsigned int>(dev_data_u, expect3, DATA_NUM)) { |
| 211 | + std::cout << "ShuffleDownSyncKernel1" << " verify failed" << std::endl; |
| 212 | + Result = false; |
| 213 | + std::cout << "expect:" << std::endl; |
| 214 | + print_data<unsigned int>(expect3, DATA_NUM, true); |
| 215 | + std::cout << "current result:" << std::endl; |
| 216 | + print_data<unsigned int>(dev_data_u, DATA_NUM); |
| 217 | + } |
| 218 | + |
| 219 | + GridSize = {2}; |
| 220 | + BlockSize = {32, 2, 1}; |
| 221 | + // NV hardware result reference |
| 222 | + unsigned int expect4[DATA_NUM] = { |
| 223 | + 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31, |
| 224 | + 28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58, |
| 225 | + 59,56,57,62,63,60,61,66,67,64,65,70,71,68,69,74,75,72,73,78,79,76,77,82,83,80,81, |
| 226 | + 86,87,84,85,90,91,88,89,94,95,92,93,98,99,96,97,102,103,100,101,106,107,104,105, |
| 227 | + 110,111,108,109,114,115,112,113,118,119,116,117,122,123,120,121,126,127,124,125 |
| 228 | + }; |
| 229 | + init_data<unsigned int>(dev_data_u, DATA_NUM); |
| 230 | + |
| 231 | + ShuffleXorSyncKernel1<<<GridSize, BlockSize>>>(dev_data_u); |
| 232 | + |
| 233 | + cudaDeviceSynchronize(); |
| 234 | + if(!verify_data<unsigned int>(dev_data_u, expect4, DATA_NUM)) { |
| 235 | + std::cout << "ShuffleXorSyncKernel1" << " verify failed" << std::endl; |
| 236 | + Result = false; |
| 237 | + std::cout << "expect:" << std::endl; |
| 238 | + print_data<unsigned int>(expect4, DATA_NUM, true); |
| 239 | + std::cout << "current result:" << std::endl; |
| 240 | + print_data<unsigned int>(dev_data_u, DATA_NUM); |
| 241 | + } |
| 242 | + |
| 243 | +// has branch 1 |
| 244 | +GridSize = {2}; |
| 245 | +BlockSize = {32, 2, 1}; |
| 246 | + // NV hardware result reference |
| 247 | +unsigned int expect5[DATA_NUM] = { |
| 248 | + 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 16, |
| 249 | + 0, 0, 0, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 48, |
| 250 | + 0, 0, 0, 0, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 0, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 80, |
| 251 | + 0, 0, 0, 0, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 0, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 112, |
| 252 | +}; |
| 253 | +init_data<unsigned int>(dev_data_u, DATA_NUM); |
| 254 | + |
| 255 | +ShuffleSyncKernel2<<<GridSize, BlockSize>>>(dev_data_u); |
| 256 | + |
| 257 | +cudaDeviceSynchronize(); |
| 258 | +// The result[15, 47, 79, 111] is undefined, so ignore those value. |
| 259 | +ignore_index = {15, 47, 79, 111}; |
| 260 | +if(!verify_data<unsigned int>(dev_data_u, expect5, DATA_NUM, 1, ignore_index)) { |
| 261 | + std::cout << "ShuffleSyncKernel2" << " verify failed" << std::endl; |
| 262 | + Result = false; |
| 263 | + std::cout << "expect:" << std::endl; |
| 264 | + print_data<unsigned int>(expect5, DATA_NUM, true); |
| 265 | + std::cout << "current result:" << std::endl; |
| 266 | + print_data<unsigned int>(dev_data_u, DATA_NUM); |
| 267 | +} |
| 268 | + |
| 269 | +GridSize = {2}; |
| 270 | +BlockSize = {32, 2, 1}; |
| 271 | + // NV hardware result reference |
| 272 | +unsigned int expect6[DATA_NUM] = { |
| 273 | + 0, 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, |
| 274 | + 0, 0, 0, 0, 0, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, |
| 275 | + 0, 0, 0, 0, 0, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, |
| 276 | + 0, 0, 0, 0, 0, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, |
| 277 | +}; |
| 278 | +init_data<unsigned int>(dev_data_u, DATA_NUM); |
| 279 | + |
| 280 | +ShuffleUpSyncKernel2<<<GridSize, BlockSize>>>(dev_data_u); |
| 281 | + |
| 282 | +cudaDeviceSynchronize(); |
| 283 | +// The result[4, 36, 68, 100] is undefined, so ignore those value. |
| 284 | +ignore_index = {4, 36, 68, 100}; |
| 285 | +if(!verify_data<unsigned int>(dev_data_u, expect6, DATA_NUM, 1, ignore_index)) { |
| 286 | + std::cout << "ShuffleUpSyncKernel2" << " verify failed" << std::endl; |
| 287 | + Result = false; |
| 288 | + std::cout << "expect:" << std::endl; |
| 289 | + print_data<unsigned int>(expect6, DATA_NUM, true); |
| 290 | + std::cout << "current result:" << std::endl; |
| 291 | + print_data<unsigned int>(dev_data_u, DATA_NUM); |
| 292 | +} |
| 293 | + |
| 294 | + |
| 295 | +GridSize = {2}; |
| 296 | +BlockSize = {32, 2, 1}; |
| 297 | + // NV hardware result reference |
| 298 | + // The result[27/59/91/123] of _shfl_down function in delta 4 and logical warp size 16 is undefined. |
| 299 | + // But the SYCL version return 28/60/92/124, so we change these 4 number in reference to result of |
| 300 | + // SYCL version function. |
| 301 | +unsigned int expect7[DATA_NUM] = { |
| 302 | + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 0, 0, 0, 0, |
| 303 | + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 0, 0, 0, 0, 0, |
| 304 | + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 79, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 0, 0, 0, 0, 0, |
| 305 | + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 0, 0, 0, 0, 0, |
| 306 | +}; |
| 307 | +init_data<unsigned int>(dev_data_u, DATA_NUM); |
| 308 | + |
| 309 | +ShuffleDownSyncKernel2<<<GridSize, BlockSize>>>(dev_data_u); |
| 310 | + |
| 311 | +cudaDeviceSynchronize(); |
| 312 | +// The result[27, 59, 91, 123] is undefined, so ignore those value. |
| 313 | +ignore_index = {27, 59, 91, 123}; |
| 314 | +if(!verify_data<unsigned int>(dev_data_u, expect7, DATA_NUM, 1, ignore_index)) { |
| 315 | + std::cout << "ShuffleDownSyncKernel2" << " verify failed" << std::endl; |
| 316 | + Result = false; |
| 317 | + std::cout << "expect:" << std::endl; |
| 318 | + print_data<unsigned int>(expect7, DATA_NUM, true); |
| 319 | + std::cout << "current result:" << std::endl; |
| 320 | + print_data<unsigned int>(dev_data_u, DATA_NUM); |
| 321 | +} |
| 322 | + |
| 323 | +GridSize = {2}; |
| 324 | +BlockSize = {32, 2, 1}; |
| 325 | + // NV hardware result reference |
| 326 | +unsigned int expect8[DATA_NUM] = { |
| 327 | + 0, 0, 0, 0, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 18, 19, 16, 17, 22, 23, 20, 21, 26, 27, 24, 25, 30, 31, 28, 29, |
| 328 | + 0, 0, 0, 0, 38, 39, 36, 37, 42, 43, 40, 41, 46, 47, 44, 45, 50, 51, 48, 49, 54, 55, 52, 53, 58, 59, 56, 57, 62, 63, 60, 61, |
| 329 | + 0, 0, 0, 0, 70, 71, 68, 69, 74, 75, 72, 73, 78, 79, 76, 77, 82, 83, 80, 81, 86, 87, 84, 85, 90, 91, 88, 89, 94, 95, 92, 93, |
| 330 | + 0, 0, 0, 0, 102, 103, 100, 101, 106, 107, 104, 105, 110, 111, 108, 109, 114, 115, 112, 113, 118, 119, 116, 117, 122, 123, 120, 121, 126, 127, 124, 125, |
| 331 | +}; |
| 332 | +init_data<unsigned int>(dev_data_u, DATA_NUM); |
| 333 | + |
| 334 | +ShuffleXorSyncKernel2<<<GridSize, BlockSize>>>(dev_data_u); |
| 335 | + |
| 336 | +cudaDeviceSynchronize(); |
| 337 | +if(!verify_data<unsigned int>(dev_data_u, expect8, DATA_NUM)) { |
| 338 | + std::cout << "ShuffleXorSyncKernel2" << " verify failed" << std::endl; |
| 339 | + Result = false; |
| 340 | + std::cout << "expect:" << std::endl; |
| 341 | + print_data<unsigned int>(expect8, DATA_NUM, true); |
| 342 | + std::cout << "current result:" << std::endl; |
| 343 | + print_data<unsigned int>(dev_data_u, DATA_NUM); |
| 344 | +} |
| 345 | + |
| 346 | +// has branch 2 |
| 347 | + |
| 348 | +GridSize = {2}; |
| 349 | +BlockSize = {32, 2, 1}; |
| 350 | + // NV hardware result reference |
| 351 | +unsigned int expect9[DATA_NUM] = { |
| 352 | + 0, 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, |
| 353 | + 0, 0, 0, 0, 0, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 63, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, |
| 354 | + 0, 0, 0, 0, 0, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 95, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, |
| 355 | + 0, 0, 0, 0, 0, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 127, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, |
| 356 | +}; |
| 357 | +init_data<unsigned int>(dev_data_u, DATA_NUM); |
| 358 | + |
| 359 | +ShuffleSyncKernel3<<<GridSize, BlockSize>>>(dev_data_u); |
| 360 | + |
| 361 | +cudaDeviceSynchronize(); |
| 362 | +// The result[4, 36, 68, 100] is undefined, so ignore those value. |
| 363 | +ignore_index = {4, 36, 68, 100}; |
| 364 | +if(!verify_data<unsigned int>(dev_data_u, expect9, DATA_NUM, 1, ignore_index)) { |
| 365 | + std::cout << "ShuffleSyncKernel3" << " verify failed" << std::endl; |
| 366 | + Result = false; |
| 367 | + std::cout << "expect:" << std::endl; |
| 368 | + print_data<unsigned int>(expect9, DATA_NUM, true); |
| 369 | + std::cout << "current result:" << std::endl; |
| 370 | + print_data<unsigned int>(dev_data_u, DATA_NUM); |
| 371 | +} |
| 372 | + |
| 373 | + if(Result) |
| 374 | + std::cout << "passed" << std::endl; |
| 375 | + else { |
| 376 | + exit(-1); |
| 377 | + } |
| 378 | + return 0; |
| 379 | +} |
| 380 | + |
| 381 | + |
0 commit comments