main.cu
#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>
// Evaluate a CUDA call exactly once, check its status, and abort on failure.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s (line %d)\n", cudaGetErrorString(err_), __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Declare your kernels (must match your .cu definitions)
extern "C" __global__ void flash_attention_spilling_from_registers_k(
float *out,
float *out_l,
float *Q,
float *K,
float *V,
float scaling,
int n,
int T_r,
int T_c
);
extern "C" __global__ void flash_attention_k(
float *out,
float *out_l,
float *Q,
float *K,
float *V,
float scaling,
int n,
int T_r,
int T_c
);
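// Ceiling division: the number of size-b tiles needed to cover a elements.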
inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }
int main() {
    // Problem sizes (small enough to test, big enough for profiling)
    int n = 128;          // number of rows in Q
    int n_inp = 128;      // number of rows in K and V
    int d = 64;           // feature dimension
    int B_r = 16;         // block tile rows
    int B_c = 16;         // block tile cols
    int block_dim_x = 32; // threads per block along x
    int block_dim_y = 16; // threads per block along y
    // Host allocations
    size_t size_Q = n * d * sizeof(float);
    size_t size_K = n_inp * d * sizeof(float);
    size_t size_V = n_inp * d * sizeof(float);
    size_t size_out = n * d * sizeof(float);
    size_t size_out_l = n * sizeof(float);
    float *h_Q = new float[n * d];
    float *h_K = new float[n_inp * d];
    float *h_V = new float[n_inp * d];
    // Fill inputs with simple deterministic patterns so runs are reproducible
    for (int i = 0; i < n * d; i++) h_Q[i] = static_cast<float>(i % 13) * 0.1f;
    for (int i = 0; i < n_inp * d; i++) {
        h_K[i] = static_cast<float>((i % 7) * 0.2f);
        h_V[i] = static_cast<float>((i % 5) * 0.3f);
    }
    // Device allocations
    float *d_Q, *d_K, *d_V, *d_out, *d_out_l;
    CUDA_CHECK(cudaMalloc(&d_Q, size_Q));
    CUDA_CHECK(cudaMalloc(&d_K, size_K));
    CUDA_CHECK(cudaMalloc(&d_V, size_V));
    CUDA_CHECK(cudaMalloc(&d_out, size_out));
    CUDA_CHECK(cudaMalloc(&d_out_l, size_out_l));
    // Copy data to device
    CUDA_CHECK(cudaMemcpy(d_Q, h_Q, size_Q, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_K, h_K, size_K, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_V, h_V, size_V, cudaMemcpyHostToDevice));
    float scaling = 1.0f / sqrtf(static_cast<float>(d)); // softmax scale 1/sqrt(d)
    int T_r = cdiv(n, B_r);     // number of row tiles
    int T_c = cdiv(n_inp, B_c); // number of column tiles
    // Single block of 32x16 threads; the kernels are assumed to iterate over
    // the T_r x T_c tiles internally, since the tile counts are passed in.
    dim3 blocks(1, 1);
    dim3 threads(block_dim_x, block_dim_y);
printf("Launching kernel...\n");
// Kernel launch
flash_attention_spilling_from_registers_k<<<blocks, threads>>>(
d_out,
d_out_l,
d_Q,
d_K,
d_V,
scaling,
n,
T_r,
T_c
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
printf("Kernel finished.\n");
printf("Launching kernel...\n");
// Kernel launch
flash_attention_k<<<blocks, threads>>>(
d_out,
d_out_l,
d_Q,
d_K,
d_V,
scaling,
n,
T_r,
T_c
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
printf("Kernel finished.\n");
    // Cleanup
    delete[] h_Q;
    delete[] h_K;
    delete[] h_V;
    CUDA_CHECK(cudaFree(d_Q));
    CUDA_CHECK(cudaFree(d_K));
    CUDA_CHECK(cudaFree(d_V));
    CUDA_CHECK(cudaFree(d_out));
    CUDA_CHECK(cudaFree(d_out_l));
    return 0;
}