diff --git a/features/feature_case/asm/asm_bar.cu b/features/feature_case/asm/asm_bar.cu index 5434edd9..dbf7193b 100644 --- a/features/feature_case/asm/asm_bar.cu +++ b/features/feature_case/asm/asm_bar.cu @@ -9,7 +9,7 @@ #include -__device__ void bar(int *arr, int *brr) { +__global__ void bar_warp_sync(int *arr, int *brr) { arr[threadIdx.x] = threadIdx.x + 10; if (threadIdx.x % 2 == 0) { for (int i = 0; i < 1000; ++i) @@ -18,7 +18,7 @@ __device__ void bar(int *arr, int *brr) { arr[threadIdx.x] = 0; } - asm volatile ("bar.warp.sync %0;" :: "r"(0b1010101010)); + asm volatile("bar.warp.sync %0;" ::"r"(0b1010101010)); if (threadIdx.x == 1) { for (int i = 0; i < 10; ++i) { brr[i] = arr[i]; @@ -26,24 +26,88 @@ __device__ void bar(int *arr, int *brr) { } } -__global__ void kernel(int *arr, int *brr) { - bar(arr, brr); -} - -int main() { +int test_bar_warp_sync() { int *arr, *brr; cudaMallocManaged(&arr, sizeof(int) * 10); cudaMemset(arr, 0, sizeof(int) * 10); cudaMallocManaged(&brr, sizeof(int) * 10); cudaMemset(brr, 0, sizeof(int) * 10); - kernel<<<1, 10>>>(arr, brr); + bar_warp_sync<<<1, 10>>>(arr, brr); cudaDeviceSynchronize(); cudaFree(arr); int res = 0; - for (int i = 1; i < 10; i+= 2) + for (int i = 1; i < 10; i += 2) if (brr[i] != i + 10 || brr[i - 1] != 0) res = 1; cudaFree(brr); return res; } + +constexpr unsigned int WIDTH = 10; +constexpr unsigned int HEIGHT = 32 * 4; + +__global__ void bar_sync_bar_arrive(int *arr, int *results) { + __shared__ int buffer[HEIGHT / 2]; + int value; + + // Initialize provider threads + if (threadIdx.x < HEIGHT / 2) + value = arr[threadIdx.x * WIDTH] + arr[(threadIdx.x + HEIGHT / 2) * WIDTH]; + else + value = 0; + + for (unsigned int i = 1; i < WIDTH + 1; ++i) { + if (threadIdx.x < HEIGHT / 2) { + // provider + buffer[threadIdx.x] = value; + asm volatile("bar.arrive 1, %0;" ::"r"(HEIGHT)); + value = arr[threadIdx.x * WIDTH + i] + + arr[(threadIdx.x + HEIGHT / 2) * WIDTH + i]; + asm volatile("bar.sync 2, %0;" ::"r"(HEIGHT)); + } else { + // consumer + asm volatile("bar.sync 1, %0;" ::"r"(HEIGHT)); + value = buffer[threadIdx.x - HEIGHT / 2]; + asm volatile("bar.arrive 2, %0;" ::"r"(HEIGHT)); + results[threadIdx.x - HEIGHT / 2] += value; + } + } +} + +int test_bar_sync_bar_arrive() { + int *arr, *results; + + cudaMallocManaged(&arr, sizeof(int) * WIDTH * HEIGHT); + for (unsigned int i = 0; i < WIDTH * HEIGHT; ++i) { + arr[i] = i; + } + + cudaMallocManaged(&results, sizeof(int) * HEIGHT / 2); + cudaMemset(results, 0, sizeof(int) * HEIGHT / 2); + + bar_sync_bar_arrive<<<1, HEIGHT>>>(arr, results); + cudaDeviceSynchronize(); + + int expected = 0; + for (unsigned int i = 0; i < WIDTH * HEIGHT; ++i) { + expected += i; + } + int result = 0; + for (unsigned int i = 0; i < HEIGHT / 2; ++i) { + result += results[i]; + } + + int res = result != expected; + + cudaFree(arr); + cudaFree(results); + return res; +} + +int main() { + int res = 0; + res |= test_bar_warp_sync(); + res |= test_bar_sync_bar_arrive(); + return res; +}