-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathld_matrix_x1.cu
More file actions
42 lines (34 loc) · 1.1 KB
/
ld_matrix_x1.cu
File metadata and controls
42 lines (34 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#include <cstdint>
#include <iostream>
// Thin wrapper around the PTX instruction
// `ldmatrix.sync.aligned.m8n8.x1.shared.b16`, which loads one 8x8 matrix of
// 16-bit elements from shared memory into a per-thread 32-bit fragment.
//
// Parameters:
//   d0      - output; receives this thread's 32-bit fragment (two packed
//             16-bit elements).
//   address - 32-bit shared-state-space address used as the instruction's
//             address operand (e.g. obtained via __cvta_generic_to_shared).
//
// Preconditions:
//   - Requires SM75+ (the ldmatrix instruction is not available earlier).
//   - `.sync.aligned` means every thread of the warp must execute this call
//     together; do not call it from divergent code.
__device__ __forceinline__ void ldmatrix_sync_aligned_m8n8_x1_b16(
    uint32_t &d0, const uint32_t &address) {
  // The instruction reads shared memory through an integer address, which the
  // compiler cannot see as a memory dependency. The "memory" clobber keeps
  // earlier shared-memory stores from being moved past (or cached across) it.
  asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];"
               : "=r"(d0)
               : "r"(address)
               : "memory");
}
// Demo kernel: fills an 8x8 tile of 16-bit values in shared memory with
// 0..63, loads it with `ldmatrix ... x1`, and prints, per thread, the value at
// smem[2 * lane] next to the two 16-bit halves of the loaded fragment.
//
// Launch requirements:
//   - 1D thread block whose size is a multiple of 32; the warp-wide ldmatrix
//     instruction needs every lane of each warp to participate.
//   - Uses 128 bytes of static shared memory; no dynamic shared memory.
//   - Requires SM75+ (ldmatrix).
//
// Parameters:
//   value - not read or written by this kernel; kept so existing launches
//           that pass a device pointer continue to compile.
__global__ void ldmatrix(uint16_t *value) {
  (void)value;  // intentionally unused, see the header comment

  constexpr uint32_t kRows = 8;
  constexpr uint32_t kCols = 8;
  constexpr uint32_t N = kRows * kCols;  // 64 elements in the tile
  constexpr uint32_t kWarpSize = 32;

  // Each row is 8 * sizeof(uint16_t) = 16 bytes. ldmatrix is given the start
  // address of each row, so the tile is aligned to 16 bytes to keep every row
  // address 16-byte aligned (a plain uint16_t array only guarantees 2 bytes).
  __shared__ __align__(16) uint16_t smem[N];

  const uint32_t tid = threadIdx.x;
  // Lane within the warp. Used for the reference read below so that blocks
  // with more than one warp never index past the 64-element tile.
  const uint32_t lane = tid % kWarpSize;

  // Byte offset of the row whose address this thread supplies. For the x1
  // form only lanes 0-7 provide the row addresses that are used; the other
  // lanes still pass a valid address.
  const uint32_t offset_rows =
      static_cast<uint32_t>(sizeof(uint16_t)) * (lane % kRows) * kCols;
  // __cvta_generic_to_shared returns size_t; shared-window addresses fit in
  // 32 bits, which is what the "r" asm operand expects, so narrow explicitly.
  const uint32_t address =
      static_cast<uint32_t>(__cvta_generic_to_shared(smem)) + offset_rows;

  // Block-stride fill: smem[i] = i for all i in [0, N).
  for (uint32_t i = tid; i < N; i += blockDim.x) {
    smem[i] = static_cast<uint16_t>(i);
  }
  // Make the fill visible to every thread before the warp-wide load.
  __syncthreads();

  uint32_t frag;
  ldmatrix_sync_aligned_m8n8_x1_b16(frag, address);
  // No barrier is needed here: nothing writes shared memory after this point.

  // The fragment packs two consecutive 16-bit elements: low half first.
  const uint16_t number1 = static_cast<uint16_t>(frag & 0xFFFF);
  const uint16_t number2 = static_cast<uint16_t>((frag >> 16) & 0xFFFF);

  // With rows stored contiguously, lane t's fragment corresponds to elements
  // 2t and 2t+1 of the tile, so smem[2 * lane] is printed as a reference next
  // to the loaded values. tid is cast to int to match the "%d" conversion.
  printf("%d -> %d %d %d \n", static_cast<int>(tid),
         static_cast<int>(smem[2 * lane]), static_cast<int>(number1),
         static_cast<int>(number2));
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Allocates a small device buffer, launches the ldmatrix demo kernel with a
// single warp (1 block x 32 threads), waits for it to finish, and frees the
// buffer. Returns 0 on success and 1 if any CUDA call or the kernel failed.
int main() {
  uint16_t *d_value = nullptr;
  if (!cudaOk(cudaMalloc(&d_value, sizeof(uint16_t)), "cudaMalloc")) {
    return 1;
  }

  ldmatrix<<<1, 32>>>(d_value);
  // Launch-configuration errors are only visible through cudaGetLastError();
  // faults inside the kernel surface at the following synchronization.
  bool ok = cudaOk(cudaGetLastError(), "ldmatrix kernel launch");
  ok = ok && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Always release the buffer, even when the kernel failed.
  ok = cudaOk(cudaFree(d_value), "cudaFree") && ok;
  return ok ? 0 : 1;
}