diff --git a/features/feature_case/asm/asm_ld.cu b/features/feature_case/asm/asm_ld.cu
index b6e3024d..b05fcfe5 100644
--- a/features/feature_case/asm/asm_ld.cu
+++ b/features/feature_case/asm/asm_ld.cu
@@ -1,4 +1,4 @@
-// ====------ asm_ld.cu ----------------------------------- *- CUDA -* ---===//
+// ====------ asm_ld.cu ----------------------------------- *- CUDA -* ---===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,13 +8,21 @@
 // ===---------------------------------------------------------------------===//
 #include <cstdio>
+#include <cstdint>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>

-#define TEST(FN) \
+using bf16 = __nv_bfloat16;
+using bf16_2 = __nv_bfloat162;
+using half_2 = __half2;
+
+
+#define TEST(FUNC_CALL) \
 { \
-  if (FN()) { \
-    printf("Test " #FN " PASS\n"); \
+  if (FUNC_CALL) { \
+    printf("Test " #FUNC_CALL " PASS\n"); \
   } else { \
-    printf("Test " #FN " FAIL\n"); \
+    printf("Test " #FUNC_CALL " FAIL\n"); \
     return 1; \
   } \
 }

@@ -126,9 +134,555 @@ bool test_3() {
   return (h_result == h_flag_value) ? true : false;
 }
+
+__device__ uint16_t ptx_ld_global_b16(uint16_t *src) {
+  uint16_t dst;
+  asm volatile("ld.global.b16 %0, [%1];" : "=h"(dst) : "l"(src));
+  return dst;
+}
+
+__device__ uint16_t ptx_ld_shared_b16(uint16_t *src) {
+  uint16_t dst;
+  uint64_t addr = static_cast<uint64_t>(__cvta_generic_to_shared(src));
+  asm volatile("ld.shared.b16 %0, [%1];" : "=h"(dst) : "l"(addr));
+  return dst;
+}
+
+__global__ void test_kernel_ld_global_b16(uint16_t *d_src, uint16_t *d_dst) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  d_dst[idx] = ptx_ld_global_b16(&d_src[idx]);
+}
+
+__global__ void test_kernel_ld_shared_b16(uint16_t *d_src, uint16_t *d_dst,
+                                          int N) {
+  extern __shared__ uint16_t smem_b16[];
+
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  // Copy data from global memory to shared memory.
+  if (idx < N) {
+    smem_b16[threadIdx.x] = d_src[idx];
+  }
+
+  // Wait until the shared memory operations are done.
+  __syncthreads();
+
+  if (idx < N) {
+    d_dst[idx] = ptx_ld_shared_b16(&smem_b16[threadIdx.x]);
+  }
+}
+
+bool test_ptx_ld_bf16(bool global_mem) {
+  const int N = 4;
+  bf16 h_src[N] = {__float2bfloat16(1.5f), __float2bfloat16(2.0f),
+                   __float2bfloat16(2.5f), __float2bfloat16(3.0f)};
+  bf16 h_dst[N] = {0};
+
+  bf16 *d_src = nullptr;
+  bf16 *d_dst = nullptr;
+
+  cudaMalloc(&d_src, N * sizeof(bf16));
+  cudaMalloc(&d_dst, N * sizeof(bf16));
+
+  cudaMemcpy(d_src, h_src, N * sizeof(bf16), cudaMemcpyHostToDevice);
+  cudaMemset(d_dst, 0, N * sizeof(bf16));
+
+  if (global_mem) {
+    test_kernel_ld_global_b16<<<1, N>>>(reinterpret_cast<uint16_t *>(d_src),
+                                        reinterpret_cast<uint16_t *>(d_dst));
+  } else {
+    test_kernel_ld_shared_b16<<<1, N, N * sizeof(uint16_t)>>>(
+        reinterpret_cast<uint16_t *>(d_src),
+        reinterpret_cast<uint16_t *>(d_dst), N);
+  }
+  cudaDeviceSynchronize();
+
+  cudaMemcpy(h_dst, d_dst, N * sizeof(bf16), cudaMemcpyDeviceToHost);
+
+  bool res = true;
+  for (int i = 0; i < N; ++i) {
+    if (h_dst[i] != h_src[i]) {
+      res = false;
+      printf("error: h_dst[%d] = %f (expected %f)\n", i,
+             __bfloat162float(h_dst[i]), __bfloat162float(h_src[i]));
+    }
+    printf("h_dst[%d] = %f (expected %f)\n", i, __bfloat162float(h_dst[i]),
+           __bfloat162float(h_src[i]));
+  }
+
+  cudaFree(d_src);
+  cudaFree(d_dst);
+
+  return res;
+}
+
+bool test_ptx_ld_half(bool global_mem) {
+  const int N = 4;
+  half h_src[N] = {__float2half(1.5f), __float2half(2.0f), __float2half(2.5f),
+                   __float2half(3.0f)};
+  half h_dst[N] = {0};
+
+  half *d_src = nullptr;
+  half *d_dst = nullptr;
+
+  cudaMalloc(&d_src, N * sizeof(half));
+  cudaMalloc(&d_dst, N * sizeof(half));
+
+  cudaMemcpy(d_src, h_src, N * sizeof(half), cudaMemcpyHostToDevice);
+  cudaMemset(d_dst, 0, N * sizeof(half));
+
+  if (global_mem) {
+    test_kernel_ld_global_b16<<<1, N>>>(reinterpret_cast<uint16_t *>(d_src),
+                                        reinterpret_cast<uint16_t *>(d_dst));
+  } else {
+    test_kernel_ld_shared_b16<<<1, N, N * sizeof(uint16_t)>>>(
+        reinterpret_cast<uint16_t *>(d_src),
+        reinterpret_cast<uint16_t *>(d_dst), N);
+  }
+
+  cudaDeviceSynchronize();
+
+  cudaMemcpy(h_dst, d_dst, N * sizeof(half), cudaMemcpyDeviceToHost);
+
+  bool res = true;
+  for (int i = 0; i < N; ++i) {
+    if (h_dst[i] != h_src[i]) {
+      res = false;
+      printf("error: h_dst[%d] = %f (expected %f)\n", i,
+             __half2float(h_dst[i]), __half2float(h_src[i]));
+    }
+    printf("h_dst[%d] = %f (expected %f)\n", i, __half2float(h_dst[i]),
+           __half2float(h_src[i]));
+  }
+
+  cudaFree(d_src);
+  cudaFree(d_dst);
+
+  return res;
+}
+
+__device__ uint32_t ptx_ld_global_b32(uint32_t *src) {
+  uint32_t dst;
+  uint64_t addr = reinterpret_cast<uint64_t>(src);
+  asm volatile("ld.global.b32 %0, [%1];" : "=r"(dst) : "l"(addr));
+  return dst;
+}
+
+__device__ uint32_t ptx_ld_shared_b32(uint32_t *src) {
+  uint32_t dst;
+  uint64_t addr = static_cast<uint64_t>(__cvta_generic_to_shared(src));
+  asm volatile("ld.shared.b32 %0, [%1];" : "=r"(dst) : "l"(addr));
+  return dst;
+}
+
+__global__ void test_kernel_ld_global_b32(uint32_t *d_src, uint32_t *d_dst) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  d_dst[idx] = ptx_ld_global_b32(&d_src[idx]);
+}
+
+__global__ void test_kernel_ld_shared_b32(uint32_t *d_src, uint32_t *d_dst,
+                                          int N) {
+  extern __shared__ uint32_t smem_b32[];
+
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  // Copy data from global memory to shared memory.
+  if (idx < N) {
+    smem_b32[threadIdx.x] = d_src[idx];
+  }
+
+  // Wait until the shared memory operations are done.
+  __syncthreads();
+
+  if (idx < N) {
+    d_dst[idx] = ptx_ld_shared_b32(&smem_b32[threadIdx.x]);
+  }
+}
+
+inline __nv_bfloat162 bf16_pair(float a, float b) {
+  return __floats2bfloat162_rn(a, b);
+}
+
+bool test_ptx_ld_bf16_2(bool global_mem) {
+  const int N = 4;
+  bf16_2 h_src[N] = {bf16_pair(1.0f, 2.0f), bf16_pair(3.0f, 4.0f),
+                     bf16_pair(5.0f, 6.0f), bf16_pair(7.0f, 8.0f)};
+  bf16_2 h_dst[N] = {bf16_pair(0.0f, 0.0f), bf16_pair(0.0f, 0.0f),
+                     bf16_pair(0.0f, 0.0f), bf16_pair(0.0f, 0.0f)};
+
+  bf16_2 *d_src = nullptr;
+  bf16_2 *d_dst = nullptr;
+
+  cudaMalloc(&d_src, N * sizeof(bf16_2));
+  cudaMalloc(&d_dst, N * sizeof(bf16_2));
+
+  cudaMemcpy(d_src, h_src, N * sizeof(bf16_2), cudaMemcpyHostToDevice);
+  cudaMemset(d_dst, 0, N * sizeof(bf16_2));
+
+  if (global_mem) {
+    test_kernel_ld_global_b32<<<1, N>>>(reinterpret_cast<uint32_t *>(d_src),
+                                        reinterpret_cast<uint32_t *>(d_dst));
+  } else {
+    test_kernel_ld_shared_b32<<<1, N, N * sizeof(uint32_t)>>>(
+        reinterpret_cast<uint32_t *>(d_src),
+        reinterpret_cast<uint32_t *>(d_dst), N);
+  }
+
+  cudaDeviceSynchronize();
+
+  cudaMemcpy(h_dst, d_dst, N * sizeof(bf16_2), cudaMemcpyDeviceToHost);
+
+  bool res = true;
+  for (int i = 0; i < N; ++i) {
+    float src_x = __bfloat162float(h_src[i].x);
+    float src_y = __bfloat162float(h_src[i].y);
+    float dst_x = __bfloat162float(h_dst[i].x);
+    float dst_y = __bfloat162float(h_dst[i].y);
+    if ((src_x != dst_x) || (src_y != dst_y)) {
+      res = false;
+      printf("error: h_dst[%d] = (%f, %f) (expected (%f, %f))\n", i, dst_x,
+             dst_y, src_x, src_y);
+    }
+    printf("h_dst[%d] = (%f, %f) (expected (%f, %f))\n", i, dst_x, dst_y,
+           src_x, src_y);
+  }
+
+  cudaFree(d_src);
+  cudaFree(d_dst);
+
+  return res;
+}
+
+inline __half2 half_pair(float a, float b) { return __floats2half2_rn(a, b); }
+
+bool test_ptx_ld_half_2(bool global_mem) {
+  const int N = 4;
+  half_2 h_src[N] = {half_pair(1.0f, 2.0f), half_pair(3.0f, 4.0f),
+                     half_pair(5.0f, 6.0f), half_pair(7.0f, 8.0f)};
+  half_2 h_dst[N] = {half_pair(0.0f, 0.0f), half_pair(0.0f, 0.0f),
+                     half_pair(0.0f, 0.0f), half_pair(0.0f, 0.0f)};
+
+  half_2 *d_src = nullptr;
+  half_2 *d_dst = nullptr;
+
+  cudaMalloc(&d_src, N * sizeof(half_2));
+  cudaMalloc(&d_dst, N * sizeof(half_2));
+
+  cudaMemcpy(d_src, h_src, N * sizeof(half_2), cudaMemcpyHostToDevice);
+  cudaMemset(d_dst, 0, N * sizeof(half_2));
+
+  if (global_mem) {
+    test_kernel_ld_global_b32<<<1, N>>>(reinterpret_cast<uint32_t *>(d_src),
+                                        reinterpret_cast<uint32_t *>(d_dst));
+  } else {
+    test_kernel_ld_shared_b32<<<1, N, N * sizeof(uint32_t)>>>(
+        reinterpret_cast<uint32_t *>(d_src),
+        reinterpret_cast<uint32_t *>(d_dst), N);
+  }
+
+  cudaDeviceSynchronize();
+
+  cudaMemcpy(h_dst, d_dst, N * sizeof(half_2), cudaMemcpyDeviceToHost);
+
+  bool res = true;
+  for (int i = 0; i < N; ++i) {
+    float src_x = __half2float(h_src[i].x);
+    float src_y = __half2float(h_src[i].y);
+    float dst_x = __half2float(h_dst[i].x);
+    float dst_y = __half2float(h_dst[i].y);
+    if ((src_x != dst_x) || (src_y != dst_y)) {
+      res = false;
+      printf("error: h_dst[%d] = (%f, %f) (expected (%f, %f))\n", i, dst_x,
+             dst_y, src_x, src_y);
+    }
+    printf("h_dst[%d] = (%f, %f) (expected (%f, %f))\n", i, dst_x, dst_y,
+           src_x, src_y);
+  }
+
+  cudaFree(d_src);
+  cudaFree(d_dst);
+
+  return res;
+}
+
+__device__ float ptx_ld_global_f32(float *src) {
+  float dst;
+  asm volatile("ld.global.f32 %0, [%1];" : "=f"(dst) : "l"(src));
+  return dst;
+}
+
+__device__ float ptx_ld_shared_f32(float *src) {
+  float dst;
+  uint64_t addr = static_cast<uint64_t>(__cvta_generic_to_shared(src));
+  asm volatile("ld.shared.f32 %0, [%1];" : "=f"(dst) : "l"(addr));
+  return dst;
+}
+
+__global__ void test_kernel_ld_global_f32(float *d_src, float *d_dst) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  d_dst[idx] = ptx_ld_global_f32(&d_src[idx]);
+}
+
+__global__ void test_kernel_ld_shared_f32(float *d_src, float *d_dst, int N) {
+  extern __shared__ float smem_f32[];
+
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  // Copy data from global memory to shared memory.
+  if (idx < N) {
+    smem_f32[threadIdx.x] = d_src[idx];
+  }
+
+  // Wait until the shared memory operations are done.
+  __syncthreads();
+
+  if (idx < N) {
+    d_dst[idx] = ptx_ld_shared_f32(&smem_f32[threadIdx.x]);
+  }
+}
+
+bool test_ptx_ld_f32(bool global_mem) {
+  const int N = 4;
+  float h_src[N] = {1.0f, 2.0f, 3.0f, 4.0f};
+  float h_dst[N] = {0};
+
+  float *d_src = nullptr;
+  float *d_dst = nullptr;
+
+  cudaMalloc(&d_src, N * sizeof(float));
+  cudaMalloc(&d_dst, N * sizeof(float));
+
+  cudaMemcpy(d_src, h_src, N * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemset(d_dst, 0, N * sizeof(float));
+
+  if (global_mem) {
+    test_kernel_ld_global_f32<<<1, N>>>(d_src, d_dst);
+  } else {
+    test_kernel_ld_shared_f32<<<1, N, N * sizeof(float)>>>(d_src, d_dst, N);
+  }
+
+  cudaDeviceSynchronize();
+
+  cudaMemcpy(h_dst, d_dst, N * sizeof(float), cudaMemcpyDeviceToHost);
+
+  bool res = true;
+  for (int i = 0; i < N; ++i) {
+    if (h_dst[i] != h_src[i]) {
+      res = false;
+      printf("error: h_dst[%d] = %f (expected %f)\n", i, h_dst[i], h_src[i]);
+    }
+    printf("h_dst[%d] = %f (expected %f)\n", i, h_dst[i], h_src[i]);
+  }
+  cudaFree(d_src);
+  cudaFree(d_dst);
+
+  return res;
+}
+
+/*
+// ld.v2 and ld.v4 are not supported by auto migration yet.
+// When they are supported, this part will be uncommented.
+
+__device__ void ptx_ld_global_v2_f32(const float2 *src, float2 &dst) {
+  asm volatile("ld.global.v2.f32 {%0, %1}, [%2];"
+               : "=f"(dst.x), "=f"(dst.y)
+               : "l"(src));
+}
+
+__device__ void ptx_ld_shared_v2_f32(const float2 *src, float2 &dst) {
+  uint64_t addr = static_cast<uint64_t>(__cvta_generic_to_shared(src));
+  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];"
+               : "=f"(dst.x), "=f"(dst.y)
+               : "l"(addr));
+}
+
+__global__ void test_kernel_ld_global_v2_f32(const float2 *d_src,
+                                             float2 *d_dst) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  ptx_ld_global_v2_f32(&d_src[idx], d_dst[idx]);
+}
+
+__global__ void test_kernel_ld_shared_v2_f32(const float2 *d_src,
+                                             float2 *d_dst, int N) {
+  extern __shared__ float2 smem_v2_f32[];
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  // Copy data from global memory to shared memory.
+  if (idx < N) {
+    smem_v2_f32[threadIdx.x] = d_src[idx];
+  }
+
+  // Wait until the shared memory operations are done.
+  __syncthreads();
+
+  if (idx < N) {
+    ptx_ld_shared_v2_f32(&smem_v2_f32[threadIdx.x], d_dst[idx]);
+  }
+}
+
+bool test_ptx_ld_v2_f32(bool global_mem) {
+  const int N = 4;
+  float2 h_src[N] = {{1.1f, 2.2f}, {3.3f, 4.4f}, {5.5f, 6.6f}, {7.7f, 8.8f}};
+  float2 h_dst[N];
+
+  float2 *d_src = nullptr;
+  float2 *d_dst = nullptr;
+
+  cudaMalloc(&d_src, N * sizeof(float2));
+  cudaMalloc(&d_dst, N * sizeof(float2));
+
+  cudaMemcpy(d_src, h_src, N * sizeof(float2), cudaMemcpyHostToDevice);
+  cudaMemset(d_dst, 0, N * sizeof(float2));
+
+  if (global_mem) {
+    test_kernel_ld_global_v2_f32<<<1, N>>>(d_src, d_dst);
+  } else {
+    test_kernel_ld_shared_v2_f32<<<1, N, N * sizeof(float2)>>>(d_src, d_dst,
+                                                               N);
+  }
+  cudaDeviceSynchronize();
+
+  cudaMemcpy(h_dst, d_dst, N * sizeof(float2), cudaMemcpyDeviceToHost);
+
+  bool res = true;
+  for (int i = 0; i < N; ++i) {
+    float src_x = h_src[i].x;
+    float src_y = h_src[i].y;
+    float dst_x = h_dst[i].x;
+    float dst_y = h_dst[i].y;
+    if ((src_x != dst_x) || (src_y != dst_y)) {
+      res = false;
+      printf("error: h_dst[%d] = (%f, %f) (expected (%f, %f))\n", i, dst_x,
+             dst_y, src_x, src_y);
+    }
+    printf("h_dst[%d] = (%f, %f) (expected (%f, %f))\n", i, dst_x, dst_y,
+           src_x, src_y);
+  }
+
+  cudaFree(d_src);
+  cudaFree(d_dst);
+
+  return res;
+}
+
+__device__ void ptx_ld_global_v4_f32(const float4 *src, float4 &dst) {
+  asm volatile("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+               : "=f"(dst.x), "=f"(dst.y), "=f"(dst.z), "=f"(dst.w)
+               : "l"(src));
+}
+
+__device__ void ptx_ld_shared_v4_f32(const float4 *src, float4 &dst) {
+  uint64_t addr = static_cast<uint64_t>(__cvta_generic_to_shared(src));
+  asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+               : "=f"(dst.x), "=f"(dst.y), "=f"(dst.z), "=f"(dst.w)
+               : "l"(addr));
+}
+
+__global__ void test_kernel_ld_global_v4_f32(const float4 *d_src,
+                                             float4 *d_dst) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  ptx_ld_global_v4_f32(&d_src[idx], d_dst[idx]);
+}
+
+__global__ void test_kernel_ld_shared_v4_f32(const float4 *d_src,
+                                             float4 *d_dst, int N) {
+  extern __shared__ float4 smem_v4_f32[];
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  // Copy data from global memory to shared memory.
+  if (idx < N) {
+    smem_v4_f32[threadIdx.x] = d_src[idx];
+  }
+
+  // Wait until the shared memory operations are done.
+  __syncthreads();
+
+  if (idx < N) {
+    ptx_ld_shared_v4_f32(&smem_v4_f32[threadIdx.x], d_dst[idx]);
+  }
+}
+
+bool test_ptx_ld_v4_f32(bool global_mem) {
+  const int N = 4;
+  float4 h_src[N] = {{1.1f, 2.2f, 3.3f, 4.4f},
+                     {5.5f, 6.6f, 7.7f, 8.8f},
+                     {9.9f, 10.1f, 11.11f, 12.12f},
+                     {13.13f, 14.14f, 15.15f, 16.16f}};
+  float4 h_dst[N];
+
+  float4 *d_src = nullptr;
+  float4 *d_dst = nullptr;
+
+  cudaMalloc(&d_src, N * sizeof(float4));
+  cudaMalloc(&d_dst, N * sizeof(float4));
+
+  cudaMemcpy(d_src, h_src, N * sizeof(float4), cudaMemcpyHostToDevice);
+  cudaMemset(d_dst, 0, N * sizeof(float4));
+
+  if (global_mem) {
+    test_kernel_ld_global_v4_f32<<<1, N>>>(d_src, d_dst);
+  } else {
+    test_kernel_ld_shared_v4_f32<<<1, N, N * sizeof(float4)>>>(d_src, d_dst,
+                                                               N);
+  }
+
+  cudaDeviceSynchronize();
+
+  cudaMemcpy(h_dst, d_dst, N * sizeof(float4), cudaMemcpyDeviceToHost);
+
+  bool res = true;
+  for (int i = 0; i < N; ++i) {
+    float src_x = h_src[i].x;
+    float src_y = h_src[i].y;
+    float src_z = h_src[i].z;
+    float src_w = h_src[i].w;
+    float dst_x = h_dst[i].x;
+    float dst_y = h_dst[i].y;
+    float dst_z = h_dst[i].z;
+    float dst_w = h_dst[i].w;
+    if ((src_x != dst_x) || (src_y != dst_y) || (src_z != dst_z) ||
+        (src_w != dst_w)) {
+      res = false;
+      printf("error: h_dst[%d] = (%f, %f, %f, %f) (expected (%f, %f, %f, "
+             "%f))\n",
+             i, dst_x, dst_y, dst_z, dst_w, src_x, src_y, src_z, src_w);
+    }
+    printf("h_dst[%d] = (%f, %f, %f, %f) (expected (%f, %f, %f, %f))\n", i,
+           dst_x, dst_y, dst_z, dst_w, src_x, src_y, src_z, src_w);
+  }
+
+  cudaFree(d_src);
+  cudaFree(d_dst);
+
+  return res;
+}
+*/

 int main() {
-  TEST(test_1);
-  TEST(test_2);
-  TEST(test_3);
+  TEST(test_1());
+  TEST(test_2());
+  TEST(test_3());
+
+#define GLOBAL_MEM true
+#define SHARED_MEM false
+
+  TEST(test_ptx_ld_bf16(GLOBAL_MEM));
+  TEST(test_ptx_ld_bf16(SHARED_MEM));
+  TEST(test_ptx_ld_half(GLOBAL_MEM));
+  TEST(test_ptx_ld_half(SHARED_MEM));
+  TEST(test_ptx_ld_bf16_2(GLOBAL_MEM));
+  TEST(test_ptx_ld_bf16_2(SHARED_MEM));
+  TEST(test_ptx_ld_half_2(GLOBAL_MEM));
+  TEST(test_ptx_ld_half_2(SHARED_MEM));
+  TEST(test_ptx_ld_f32(GLOBAL_MEM));
+  TEST(test_ptx_ld_f32(SHARED_MEM));
+  /* TEST(test_ptx_ld_v2_f32(GLOBAL_MEM));
+  TEST(test_ptx_ld_v2_f32(SHARED_MEM));
+  TEST(test_ptx_ld_v4_f32(GLOBAL_MEM));
+  TEST(test_ptx_ld_v4_f32(SHARED_MEM)); */
+
   return 0;
 }
\ No newline at end of file
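
Note: every inline-PTX wrapper in this patch is only expected to be observationally equivalent to an ordinary C++ load of the same address, which is what the host-side comparisons check after round-tripping through the kernels. A minimal sketch of that baseline is shown below; it is illustrative only, not part of the patch, and the kernel name reference_copy_b16 is hypothetical.

// Plain load/store baseline: ptx_ld_global_b16 should produce the same result
// as this ordinary 16-bit load for every element it is given.
#include <cstdint>

__global__ void reference_copy_b16(const uint16_t *src, uint16_t *dst) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  dst[idx] = src[idx]; // ordinary global load, no inline PTX
}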