|
6 | 6 |
|
7 | 7 | // clang-format off
|
8 | 8 | #include <cuda_runtime.h>
|
| 9 | +#include <cuda_bf16.h> |
| 10 | +#include <stdint.h> |
| 11 | +#include <stdio.h> |
| 12 | + |
| 13 | +using bf16 = __nv_bfloat16; |
| 14 | +using bf16_2 = __nv_bfloat162; |
| 15 | +using half_2 = __half2; |
9 | 16 |
|
10 | 17 | /*
|
11 | 18 | .ss = { .const, .global, .local, .param, .shared };
|
@@ -93,4 +100,201 @@ __device__ __forceinline__ int ld_flag_acquire(int* flag_addr) {
|
93 | 100 | return flag;
|
94 | 101 | }
|
95 | 102 |
|
| 103 | + // CHECK: static inline void lds(bf16& dst, uint32_t src) { |
| 104 | + // CHECK-NEXT: *(uint16_t*)&dst = *((uint16_t *)(uintptr_t)src); |
| 105 | + // CHECK-NEXT: } |
| 106 | + __device__ static inline void lds(bf16& dst, uint32_t src) { |
| 107 | + asm volatile("ld.shared.b16 %0, [%1];" : "=h"(*(uint16_t*)&dst) : "r"(src)); |
| 108 | + } |
| 109 | + |
| 110 | +// CHECK: static inline void sts(uint32_t dst, const bf16& src) { |
| 111 | +// CHECK-NEXT: *((uint16_t *)(uintptr_t)dst) = *(uint16_t*)&src; |
| 112 | +// CHECK-NEXT: } |
| 113 | +__device__ static inline void sts(uint32_t dst, const bf16& src) { |
| 114 | + asm volatile("st.shared.b16 [%1], %0;\n" : : "h"(*(uint16_t*)&src), "r"(dst)); |
| 115 | +} |
| 116 | + |
| 117 | +// CHECK: static inline void ldg(bf16& dst, bf16* src) { |
| 118 | +// CHECK-NEXT: *(uint16_t*)&dst = *((uint16_t *)(uintptr_t)src); |
| 119 | +// CHECK-NEXT: } |
| 120 | +__device__ static inline void ldg(bf16& dst, bf16* src) { |
| 121 | + asm volatile("ld.global.b16 %0, [%1];\n" : "=h"(*(uint16_t*)&dst) : "l"(src)); |
| 122 | +} |
| 123 | + |
| 124 | +// CHECK: static inline void stg(bf16* dst, const bf16& src) { |
| 125 | +// CHECK-NEXT: *((uint16_t *)(uintptr_t)dst) = *(uint16_t*)&src; |
| 126 | +// CHECK-NEXT: } |
| 127 | +__device__ static inline void stg(bf16* dst, const bf16& src) { |
| 128 | + asm volatile("st.global.b16 [%1], %0;\n" : : "h"(*(uint16_t*)&src), "l"(dst)); |
| 129 | +} |
| 130 | + |
| 131 | +// CHECK: static inline void lds(sycl::half& dst, uint32_t src) { |
| 132 | +// CHECK-NEXT: *(uint16_t*)&dst = *((uint16_t *)(uintptr_t)src); |
| 133 | +// CHECK-NEXT: } |
| 134 | +__device__ static inline void lds(half& dst, uint32_t src) { |
| 135 | + asm volatile("ld.shared.b16 %0, [%1];\n" : "=h"(*(uint16_t*)&dst) : "r"(src)); |
| 136 | +} |
| 137 | + |
| 138 | +// CHECK: static inline void sts(uint32_t dst, const sycl::half& src) { |
| 139 | +// CHECK-NEXT: *((uint16_t *)(uintptr_t)dst) = *(uint16_t*)&src; |
| 140 | +// CHECK-NEXT: } |
| 141 | +__device__ static inline void sts(uint32_t dst, const half& src) { |
| 142 | + asm volatile("st.shared.b16 [%1], %0;\n" : : "h"(*(uint16_t*)&src), "r"(dst)); |
| 143 | +} |
| 144 | + |
| 145 | +// CHECK: static inline void ldg(sycl::half& dst, sycl::half* src) { |
| 146 | +// CHECK-NEXT: *(uint16_t*)&dst = *((uint16_t *)(uintptr_t)src); |
| 147 | +// CHECK-NEXT: } |
| 148 | +__device__ static inline void ldg(half& dst, half* src) { |
| 149 | + asm volatile("ld.global.b16 %0, [%1];\n" : "=h"(*(uint16_t*)&dst) : "l"(src)); |
| 150 | +} |
| 151 | + |
| 152 | +// CHECK: static inline void stg(sycl::half* dst, const sycl::half& src) { |
| 153 | +// CHECK-NEXT: *((uint16_t *)(uintptr_t)dst) = *(uint16_t*)&src; |
| 154 | +// CHECK-NEXT: } |
| 155 | +__device__ static inline void stg(half* dst, const half& src) { |
| 156 | + asm volatile("st.global.b16 [%1], %0;\n" : : "h"(*(uint16_t*)&src), "l"(dst)); |
| 157 | +} |
| 158 | + |
| 159 | +// CHECK: static inline void lds(float& dst, uint32_t src) { |
| 160 | +// CHECK-NEXT: dst = *((float *)(uintptr_t)src); |
| 161 | +// CHECK-NEXT: } |
| 162 | +__device__ static inline void lds(float& dst, uint32_t src) { |
| 163 | + asm volatile("ld.shared.f32 %0, [%1];\n" : "=f"(dst) : "r"(src)); |
| 164 | +} |
| 165 | + |
| 166 | +// CHECK: static inline void sts(uint32_t dst, const float& src) { |
| 167 | +// CHECK-NEXT: *((float *)(uintptr_t)dst) = src; |
| 168 | +// CHECK-NEXT: } |
| 169 | +__device__ static inline void sts(uint32_t dst, const float& src) { |
| 170 | + asm volatile("st.shared.f32 [%1], %0;\n" : : "f"(src), "r"(dst)); |
| 171 | +} |
| 172 | + |
| 173 | +// CHECK: static inline void ldg(float& dst, float* src) { |
| 174 | +// CHECK-NEXT: dst = *src; |
| 175 | +// CHECK-NEXT: } |
| 176 | +__device__ static inline void ldg(float& dst, float* src) { |
| 177 | + asm volatile("ld.global.f32 %0, [%1];\n" : "=f"(dst) : "l"(src)); |
| 178 | +} |
| 179 | + |
| 180 | +// CHECK: static inline void stg(float* dst, const float& src) { |
| 181 | +// CHECK-NEXT: *dst = src; |
| 182 | +// CHECK-NEXT: } |
| 183 | +__device__ static inline void stg(float* dst, const float& src) { |
| 184 | + asm volatile("st.global.f32 [%1], %0;\n" : : "f"(src), "l"(dst)); |
| 185 | +} |
| 186 | + |
| 187 | +// CHECK: static inline void lds(bf16_2& dst, uint32_t src) { |
| 188 | +// CHECK-NEXT: *(uint32_t*)&dst = *((uint32_t *)(uintptr_t)src); |
| 189 | +// CHECK-NEXT: } |
| 190 | +__device__ static inline void lds(bf16_2& dst, uint32_t src) { |
| 191 | + asm volatile("ld.shared.b32 %0, [%1];\n" : "=r"(*(uint32_t*)&dst) : "r"(src)); |
| 192 | +} |
| 193 | + |
| 194 | +// CHECK: static inline void sts(uint32_t dst, const bf16_2& src) { |
| 195 | +// CHECK-NEXT: *((uint32_t *)(uintptr_t)dst) = (*(uint32_t*)&src); |
| 196 | +// CHECK-NEXT: } |
| 197 | +__device__ static inline void sts(uint32_t dst, const bf16_2& src) { |
| 198 | + asm volatile("st.shared.b32 [%1], %0;\n" : : "r"(*(uint32_t*)&src), "r"(dst)); |
| 199 | +} |
| 200 | + |
| 201 | +// CHECK: static inline void ldg(bf16_2& dst, bf16_2* src) { |
| 202 | +// CHECK-NEXT: *(uint32_t*)&dst = *((uint32_t *)(uintptr_t)src); |
| 203 | +// CHECK-NEXT: } |
| 204 | +__device__ static inline void ldg(bf16_2& dst, bf16_2* src) { |
| 205 | + asm volatile("ld.global.b32 %0, [%1];\n" : "=r"(*(uint32_t*)&dst) : "l"(src)); |
| 206 | +} |
| 207 | + |
| 208 | +// CHECK: static inline void stg(bf16_2* dst, const bf16_2& src) { |
| 209 | +// CHECK-NEXT: *((uint32_t *)(uintptr_t)dst) = (*(uint32_t*)&src); |
| 210 | +// CHECK-NEXT: } |
| 211 | +__device__ static inline void stg(bf16_2* dst, const bf16_2& src) { |
| 212 | + asm volatile("st.global.b32 [%1], %0;\n" : : "r"(*(uint32_t*)&src), "l"(dst)); |
| 213 | +} |
| 214 | + |
| 215 | +// CHECK: static inline void lds(half_2& dst, uint32_t src) { |
| 216 | +// CHECK-NEXT: *(uint32_t*)&dst = *((uint32_t *)(uintptr_t)src); |
| 217 | +// CHECK-NEXT: } |
| 218 | +__device__ static inline void lds(half_2& dst, uint32_t src) { |
| 219 | + asm volatile("ld.shared.b32 %0, [%1];\n" : "=r"(*(uint32_t*)&dst) : "r"(src)); |
| 220 | +} |
| 221 | + |
| 222 | +// CHECK: static inline void sts(uint32_t dst, const half_2& src) { |
| 223 | +// CHECK-NEXT: *((uint32_t *)(uintptr_t)dst) = (*(uint32_t*)&src); |
| 224 | +// CHECK-NEXT: } |
| 225 | +__device__ static inline void sts(uint32_t dst, const half_2& src) { |
| 226 | + asm volatile("st.shared.b32 [%1], %0;\n" : : "r"(*(uint32_t*)&src), "r"(dst)); |
| 227 | +} |
| 228 | + |
| 229 | +// CHECK: static inline void ldg(half_2& dst, half_2* src) { |
| 230 | +// CHECK-NEXT: *(uint32_t*)&dst = *((uint32_t *)(uintptr_t)src); |
| 231 | +// CHECK-NEXT: } |
| 232 | +__device__ static inline void ldg(half_2& dst, half_2* src) { |
| 233 | + asm volatile("ld.global.b32 %0, [%1];\n" : "=r"(*(uint32_t*)&dst) : "l"(src)); |
| 234 | +} |
| 235 | + |
| 236 | +// CHECK: static inline void stg(half_2* dst, const half_2& src) { |
| 237 | +// CHECK-NEXT: *((uint32_t *)(uintptr_t)dst) = (*(uint32_t*)&src); |
| 238 | +// CHECK-NEXT: } |
| 239 | +__device__ static inline void stg(half_2* dst, const half_2& src) { |
| 240 | + asm volatile("st.global.b32 [%1], %0;\n" : : "r"(*(uint32_t*)&src), "l"(dst)); |
| 241 | +} |
| 242 | + |
| 243 | +// CHECK: static inline void lds(sycl::float2& dst, uint32_t src) { |
| 244 | +// CHECK-NEXT: {dst.x(), dst.y()} = *((float *)(uintptr_t)src); |
| 245 | +// CHECK-NEXT: } |
| 246 | +__device__ static inline void lds(float2& dst, uint32_t src) { |
| 247 | + asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];\n" : "=f"(dst.x), "=f"(dst.y) : "r"(src)); |
| 248 | +} |
| 249 | + |
| 250 | +// CHECK: static inline void sts(uint32_t dst, const sycl::float2& src) { |
| 251 | +// CHECK-NEXT: *((float *)(uintptr_t)dst) = {src.x(), src.y()}; |
| 252 | +// CHECK-NEXT: } |
| 253 | +__device__ static inline void sts(uint32_t dst, const float2& src) { |
| 254 | + asm volatile("st.shared.v2.f32 [%2], {%0, %1};\n" : : "f"(src.x), "f"(src.y), "r"(dst)); |
| 255 | +} |
| 256 | + |
| 257 | +// CHECK: static inline void ldg(sycl::float2& dst, sycl::float2* src) { |
| 258 | +// CHECK-NEXT: {dst.x(), dst.y()} = *((float *)(uintptr_t)src); |
| 259 | +// CHECK-NEXT: } |
| 260 | +__device__ static inline void ldg(float2& dst, float2* src) { |
| 261 | + asm volatile("ld.global.v2.f32 {%0, %1}, [%2];\n" : "=f"(dst.x), "=f"(dst.y) : "l"(src)); |
| 262 | +} |
| 263 | + |
| 264 | +// CHECK: static inline void stg(sycl::float2* dst, const sycl::float2& src) { |
| 265 | +// CHECK-NEXT: *((float *)(uintptr_t)dst) = {src.x(), src.y()}; |
| 266 | +// CHECK-NEXT: } |
| 267 | +__device__ static inline void stg(float2* dst, const float2& src) { |
| 268 | + asm volatile("st.global.v2.f32 [%2], {%0, %1};\n" : : "f"(src.x), "f"(src.y), "l"(dst)); |
| 269 | +} |
| 270 | + |
| 271 | +// CHECK: static inline void lds(sycl::float4& dst, uint32_t src) { |
| 272 | +// CHECK-NEXT: {dst.x(), dst.y(), dst.z(), dst.w()} = *((float *)(uintptr_t)src); |
| 273 | +// CHECK-NEXT: } |
| 274 | +__device__ static inline void lds(float4& dst, uint32_t src) { |
| 275 | + asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];\n" : "=f"(dst.x), "=f"(dst.y), "=f"(dst.z), "=f"(dst.w) : "r"(src)); |
| 276 | +} |
| 277 | + |
| 278 | +// CHECK: static inline void sts(uint32_t dst, const sycl::float4& src) { |
| 279 | +// CHECK-NEXT: *((float *)(uintptr_t)dst) = {src.x(), src.y(), src.z(), src.w()}; |
| 280 | +// CHECK-NEXT: } |
| 281 | +__device__ static inline void sts(uint32_t dst, const float4& src) { |
| 282 | + asm volatile("st.shared.v4.f32 [%4], {%0, %1, %2, %3};\n" : : "f"(src.x), "f"(src.y), "f"(src.z), "f"(src.w), "r"(dst)); |
| 283 | +} |
| 284 | + |
| 285 | +// CHECK: static inline void ldg(sycl::float4& dst, sycl::float4* src) { |
| 286 | +// CHECK-NEXT: {dst.x(), dst.y(), dst.z(), dst.w()} = *((float *)(uintptr_t)src); |
| 287 | +// CHECK-NEXT: } |
| 288 | +__device__ static inline void ldg(float4& dst, float4* src) { |
| 289 | + asm volatile("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];\n" : "=f"(dst.x), "=f"(dst.y), "=f"(dst.z), "=f"(dst.w) : "l"(src)); |
| 290 | +} |
| 291 | + |
| 292 | +// CHECK: static inline void stg(sycl::float4* dst, const sycl::float4& src) { |
| 293 | +// CHECK-NEXT: *((float *)(uintptr_t)dst) = {src.x(), src.y(), src.z(), src.w()}; |
| 294 | +// CHECK-NEXT: } |
| 295 | +__device__ static inline void stg(float4* dst, const float4& src) { |
| 296 | + asm volatile("st.global.v4.f32 [%4], {%0, %1, %2, %3};\n" : : "f"(src.x), "f"(src.y), "f"(src.z), "f"(src.w), "l"(dst)); |
| 297 | +} |
| 298 | + |
| 299 | + |
96 | 300 | // clang-format on
|
0 commit comments