Commit 438a077

WIP: ggml-cuda: Add bf16 cuda support to fattn (Flash Attention)

1 parent 25ff6f7

56 files changed: +1,828 −76 lines

Note: this is a large commit, so some of the 56 changed files are hidden by default and not shown below.

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -34,6 +34,7 @@ else()
     add_subdirectory(gen-docs)
     add_subdirectory(training)
     add_subdirectory(diffusion)
+    add_subdirectory(sweep-bench)
     if (NOT GGML_BACKEND_DL)
         add_subdirectory(convert-llama2c-to-ggml)
         # these examples use the backends directly and cannot be built with dynamic loading
```
examples/cuda_p2p_bench.cpp

Whitespace-only changes.

examples/cuda_p2p_bench.h

Whitespace-only changes.

examples/sweep-bench/CMakeLists.txt

Lines changed: 28 additions & 0 deletions

```cmake
cmake_minimum_required(VERSION 3.10)
project(llama_sweep_bench)

set(TARGET llama-sweep-bench)

find_package(CUDAToolkit REQUIRED)

add_executable(${TARGET}
    sweep-bench.cpp
    cuda_p2p_bench.cpp
    nccl_allreduce_bench.cpp
    nccl_sendrecv_bench.cpp
)

# Inherit CUDA settings from parent; no need to enable_language(CUDA) or set includes

install(TARGETS ${TARGET} RUNTIME DESTINATION bin)

# Locate NCCL once, up front, and link it via the found path rather than a bare
# `nccl` that depends on the linker search path.
find_library(NCCL_LIBRARY nccl REQUIRED)

if(TARGET CUDA::cudart_static)
    target_link_libraries(${TARGET} PRIVATE common llama CUDA::cudart_static ${NCCL_LIBRARY})
elseif(DEFINED CUDA_CUDART_LIBRARY)
    target_link_libraries(${TARGET} PRIVATE common llama ${CUDA_CUDART_LIBRARY} ${NCCL_LIBRARY})
else()
    message(FATAL_ERROR "Could not find CUDA runtime library target or variable. Please ensure CUDA is installed and CMake can find cudart_static or CUDA_CUDART_LIBRARY.")
endif()
```

examples/sweep-bench/README.md

Lines changed: 48 additions & 0 deletions

# llama.cpp/examples/sweep-bench

Benchmark the prompt processing and token generation performance of `llama.cpp`
by doing a sweep over the whole context size and gathering performance metrics
in each ubatch-sized window. Only a single token sequence is used.

The benchmark steps are, for each ubatch-sized window in the context (a timing
sketch follows the list):

1. generate ubatch/4 tokens (not the whole window, to save some time)
2. measure generation performance
3. prepare a ubatch-sized batch of random tokens
4. process the prepared batch
5. measure prompt processing performance

The purpose of the benchmark is to visualize how the performance changes with
the context size, without averaging the metric values over the whole context.
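A minimal, self-contained sketch of the per-window timing bookkeeping described above; `generate_token` and `process_batch` are hypothetical stand-ins for the real llama.cpp decode calls, and `n_ctx`/`n_ubatch` mirror the `-c`/`-ub` flags:

```cpp
#include <chrono>
#include <cstdio>

// Hypothetical stand-ins: in the real tool these wrap llama.cpp decode calls.
static void generate_token()         { /* decode a single token here */ }
static void process_batch(int /*n*/) { /* decode one ubatch of prompt tokens here */ }

int main() {
    const int n_ctx    = 8704; // total context to sweep (-c)
    const int n_ubatch = 512;  // window size (-ub)

    using clk = std::chrono::steady_clock;

    for (int n_kv = 0; n_kv + n_ubatch <= n_ctx; n_kv += n_ubatch) {
        // Steps 1-2: time the generation of ubatch/4 tokens, one at a time
        auto t0 = clk::now();
        for (int k = 0; k < n_ubatch / 4; ++k) generate_token();
        double t_tg = std::chrono::duration<double>(clk::now() - t0).count();

        // Steps 3-5: time the processing of one ubatch of random prompt tokens
        t0 = clk::now();
        process_batch(n_ubatch);
        double t_pp = std::chrono::duration<double>(clk::now() - t0).count();

        // With real decode calls wired in, these are the S_TG and S_PP columns
        printf("N_KV=%5d  S_TG=%.2f t/s  S_PP=%.2f t/s\n",
               n_kv, (n_ubatch / 4) / t_tg, n_ubatch / t_pp);
    }
    return 0;
}
```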
## Usage

```bash
./llama-sweep-bench -c 8704 -ub 512 -m models/Meta-Llama-3.2-3B-Instruct-Q8_0.gguf
```
## Sample results

- `PP` - prompt tokens per ubatch
- `TG` - generated tokens per ubatch
- `N_KV` - current KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
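For example, from the first row of the table below: `S_PP = PP / T_PP = 1024 / 375.321 ≈ 2.73 t/s` and `S_TG = TG / T_TG = 256 / 94.977 ≈ 2.70 t/s` (here `B = 1`, since only a single sequence is used).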
| PP   | TG  | N_KV | T_PP s  | S_PP t/s | T_TG s  | S_TG t/s |
|------|-----|------|---------|----------|---------|----------|
| 1024 | 256 | 0    | 375.321 | 2.73     | 94.977  | 2.70     |
| 1024 | 256 | 1024 | 416.327 | 2.46     | 113.177 | 2.26     |

### JSONL output

Pass `--batched-bench-output-jsonl` to output JSONL instead of Markdown, à la

```json lines
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 1024, "flash_attn": 1, "n_gpu_layers": 99, "n_threads": 48, "n_threads_batch": 48, "pp": 1024, "tg": 256, "n_kv": 0, "t_pp": 375.321000, "speed_pp": 2.730000, "t_tg": 94.977000, "speed_tg": 2.700000 }
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 1024, "flash_attn": 1, "n_gpu_layers": 99, "n_threads": 48, "n_threads_batch": 48, "pp": 1024, "tg": 256, "n_kv": 1024, "t_pp": 416.327000, "speed_pp": 2.460000, "t_tg": 113.177000, "speed_tg": 2.260000 }
```
examples/sweep-bench/cuda_p2p_bench.cpp

Lines changed: 131 additions & 0 deletions

```cpp
#include "cuda_p2p_bench.h"
#include <cstdio>
#include <cuda_runtime.h>
#include <thread>
#include <algorithm>

void run_cuda_p2p_bench() {
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 2) {
        printf("Need at least 2 GPUs for CUDA P2P test.\n");
        return;
    }
    printf("CUDA P2P communication paths (PCI bus IDs):\n");
    for (int i = 0; i < num_devices; ++i) {
        cudaDeviceProp prop_i;
        cudaGetDeviceProperties(&prop_i, i);
        for (int j = 0; j < num_devices; ++j) {
            if (i == j) continue;
            cudaDeviceProp prop_j;
            cudaGetDeviceProperties(&prop_j, j);
            int can_access = 0;
            cudaDeviceCanAccessPeer(&can_access, i, j);
            printf("  GPU %d (%04x:%02x:%02x) <-> GPU %d (%04x:%02x:%02x) : P2P %s\n",
                   i, prop_i.pciDomainID, prop_i.pciBusID, prop_i.pciDeviceID,
                   j, prop_j.pciDomainID, prop_j.pciBusID, prop_j.pciDeviceID,
                   can_access ? "ENABLED" : "DISABLED");
        }
    }
    // Enable peer access; if it fails, cudaMemcpyPeerAsync silently falls back
    // to staging through host memory
    for (int i = 0; i < num_devices; ++i) {
        cudaSetDevice(i);
        for (int j = 0; j < num_devices; ++j) {
            if (i != j) cudaDeviceEnablePeerAccess(j, 0);
        }
    }
    // Uni-directional and bi-directional bandwidth test for each pair
    size_t size = 1024 * 1024 * 1024; // 1 GB
    const int num_runs = 100;
    for (int i = 0; i < num_devices; ++i) {
        for (int j = 0; j < num_devices; ++j) {
            if (i == j) continue;
            cudaSetDevice(i);
            void *src, *dst;
            cudaMalloc(&src, size);
            cudaSetDevice(j);
            cudaMalloc(&dst, size);
            cudaStream_t stream;
            cudaStreamCreate(&stream);
            // Uni-directional bandwidth test (i -> j)
            float total_ms = 0.0f;
            for (int run = 0; run < num_runs; ++run) {
                cudaEvent_t start, stop;
                cudaEventCreate(&start);
                cudaEventCreate(&stop);
                cudaEventRecord(start, stream);
                cudaMemcpyPeerAsync(dst, j, src, i, size, stream);
                cudaEventRecord(stop, stream);
                cudaStreamSynchronize(stream);
                float ms = 0.0f;
                cudaEventElapsedTime(&ms, start, stop);
                total_ms += ms;
                cudaEventDestroy(start);
                cudaEventDestroy(stop);
            }
            float avg_ms = total_ms / num_runs;
            float bandwidth = (float)size / (avg_ms * 1e6f); // GB/s
            printf("GPU %d <-> GPU %d:\n  Bandwidth GPU %d -> GPU %d: %.2f GB/s (avg over %d runs)\n",
                   i, j, i, j, bandwidth, num_runs);
            // Bi-directional bandwidth test (both directions truly in parallel)
            cudaStream_t stream0, stream1;
            cudaSetDevice(i);
            cudaStreamCreate(&stream0);
            cudaSetDevice(j);
            cudaStreamCreate(&stream1);
            float total_bi_ms = 0.0f;
            for (int run = 0; run < num_runs; ++run) {
                cudaEvent_t start0, stop0, start1, stop1;
                cudaSetDevice(i);
                cudaEventCreate(&start0);
                cudaEventCreate(&stop0);
                cudaSetDevice(j);
                cudaEventCreate(&start1);
                cudaEventCreate(&stop1);
                cudaSetDevice(i);
                cudaEventRecord(start0, stream0);
                cudaSetDevice(j);
                cudaEventRecord(start1, stream1);
                // Launch both directions in parallel
                std::thread t0([&]() {
                    cudaSetDevice(i);
                    cudaMemcpyPeerAsync(dst, j, src, i, size, stream0);
                    cudaEventRecord(stop0, stream0);
                });
                std::thread t1([&]() {
                    cudaSetDevice(j);
                    cudaMemcpyPeerAsync(src, i, dst, j, size, stream1);
                    cudaEventRecord(stop1, stream1);
                });
                t0.join();
                t1.join();
                cudaSetDevice(i);
                cudaStreamSynchronize(stream0);
                cudaSetDevice(j);
                cudaStreamSynchronize(stream1);
                float ms0 = 0.0f, ms1 = 0.0f;
                cudaEventElapsedTime(&ms0, start0, stop0);
                cudaEventElapsedTime(&ms1, start1, stop1);
                float ms = std::max(ms0, ms1); // the slower direction bounds the run
                total_bi_ms += ms;
                cudaSetDevice(i);
                cudaEventDestroy(start0);
                cudaEventDestroy(stop0);
                cudaSetDevice(j);
                cudaEventDestroy(start1);
                cudaEventDestroy(stop1);
            }
            float avg_bi_ms = total_bi_ms / num_runs;
            float bi_bandwidth = 2.0f * (float)size / (avg_bi_ms * 1e6f); // GB/s, both directions counted
            printf("  Bi-directional bandwidth GPU %d <-> GPU %d: %.2f GB/s (avg over %d runs)\n",
                   i, j, bi_bandwidth, num_runs);
            cudaStreamDestroy(stream);
            cudaSetDevice(i);
            cudaStreamDestroy(stream0);
            cudaFree(src);
            cudaSetDevice(j);
            cudaStreamDestroy(stream1);
            cudaFree(dst);
        }
    }
}
```
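None of the CUDA calls above are error-checked, so failures (e.g. an out-of-memory `cudaMalloc` of the 1 GB buffers) go unnoticed. A small wrapper macro like the following, a common pattern but not part of this commit, could guard each call:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper, not part of this commit: abort with file/line context
// when any CUDA runtime call fails.
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(1);                                                   \
        }                                                              \
    } while (0)

// Usage: CUDA_CHECK(cudaMalloc(&src, size));
```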

examples/sweep-bench/cuda_p2p_bench.h

Lines changed: 2 additions & 0 deletions

```cpp
#pragma once
void run_cuda_p2p_bench();
```
examples/sweep-bench/nccl_allreduce_bench.cpp

Lines changed: 67 additions & 0 deletions

```cpp
#include "nccl_allreduce_bench.h"
#include <cstdio>
#include <vector>
#include <thread>
#include <cuda_runtime.h>
#include <nccl.h>

void run_nccl_allreduce_bench() {
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 2) {
        printf("Need at least 2 GPUs for NCCL AllReduce test.\n");
        return;
    }
    printf("Warming up NCCL communication on all GPUs...\n");
    size_t size = 1024 * 1024 * 1024; // 1 GB per buffer
    std::vector<void*> sendbuffs(num_devices), recvbuffs(num_devices);
    std::vector<cudaStream_t> streams(num_devices);
    for (int i = 0; i < num_devices; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&sendbuffs[i], size);
        cudaMalloc(&recvbuffs[i], size);
        cudaStreamCreate(&streams[i]);
    }
    // One communicator per device, all managed by this single process
    std::vector<ncclComm_t> comms(num_devices);
    std::vector<int> devs(num_devices);
    for (int i = 0; i < num_devices; ++i) devs[i] = i;
    ncclCommInitAll(comms.data(), num_devices, devs.data());
    const int num_runs = 100;
    std::vector<float> total_ms(num_devices, 0.0f);
    for (int run = 0; run < num_runs; ++run) {
        std::vector<std::thread> threads;
        std::vector<cudaEvent_t> starts(num_devices), stops(num_devices);
        for (int i = 0; i < num_devices; ++i) {
            cudaSetDevice(i);
            cudaEventCreate(&starts[i]);
            cudaEventCreate(&stops[i]);
        }
        // One thread per rank issues its collective; all ranks must
        // participate before any of them can complete
        for (int i = 0; i < num_devices; ++i) {
            threads.emplace_back([i, &comms, &sendbuffs, &recvbuffs, &streams, &starts, &stops, size]() {
                cudaSetDevice(i);
                cudaEventRecord(starts[i], streams[i]);
                ncclAllReduce(sendbuffs[i], recvbuffs[i], size / sizeof(float), ncclFloat, ncclSum, comms[i], streams[i]);
                cudaEventRecord(stops[i], streams[i]);
                cudaStreamSynchronize(streams[i]);
            });
        }
        for (auto& t : threads) t.join();
        for (int i = 0; i < num_devices; ++i) {
            float ms = 0.0f;
            cudaEventSynchronize(stops[i]);
            cudaEventElapsedTime(&ms, starts[i], stops[i]);
            total_ms[i] += ms;
            cudaEventDestroy(starts[i]);
            cudaEventDestroy(stops[i]);
        }
    }
    for (int i = 0; i < num_devices; ++i) {
        cudaFree(sendbuffs[i]);
        cudaFree(recvbuffs[i]);
        cudaStreamDestroy(streams[i]);
        float avg_ms = total_ms[i] / num_runs;
        float bandwidth = (float)size / (avg_ms * 1e6f); // algorithmic GB/s
        printf("NCCL Bandwidth GPU %d: %.2f GB/s (avg over %d runs)\n", i, bandwidth, num_runs);
        ncclCommDestroy(comms[i]);
    }
}
```
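The printed figure is the algorithmic bandwidth: the buffer size divided by the time the collective took on that rank. For all-reduce, the nccl-tests convention additionally reports a bus bandwidth scaled by `2*(n-1)/n` to make results comparable across rank counts; a quick conversion sketch (the formula follows the nccl-tests documentation, the numbers are example values):

```cpp
#include <cstdio>

// Convert algorithmic bandwidth to bus bandwidth for all-reduce,
// following the busBw = algBw * 2*(n-1)/n formula used by nccl-tests.
int main() {
    const int   n      = 4;     // number of ranks (example value)
    const float alg_bw = 50.0f; // GB/s as printed by the bench (example value)
    const float bus_bw = alg_bw * 2.0f * (n - 1) / n;
    printf("algBw = %.1f GB/s -> busBw = %.1f GB/s with %d ranks\n", alg_bw, bus_bw, n);
    return 0;
}
```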
examples/sweep-bench/nccl_allreduce_bench.h

Lines changed: 2 additions & 0 deletions

```cpp
#pragma once
void run_nccl_allreduce_bench();
```
examples/sweep-bench/nccl_sendrecv_bench.cpp

Lines changed: 76 additions & 0 deletions

```cpp
#include "nccl_sendrecv_bench.h"
#include <cstdio>
#include <thread>
#include <cuda_runtime.h>
#include <nccl.h>
#include <algorithm> // for std::max

void run_nccl_sendrecv_bench() {
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 2) {
        printf("Need at least 2 GPUs for NCCL Send/Recv test.\n");
        return;
    }
    // Only GPUs 0 and 1 are exercised, even when more devices are present
    size_t size = 1024 * 1024 * 1024; // 1 GB per buffer
    void *sendbuffs[2], *recvbuffs[2];
    cudaStream_t streams0[2], streams1[2]; // separate send and recv streams per rank
    for (int i = 0; i < 2; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&sendbuffs[i], size);
        cudaMalloc(&recvbuffs[i], size);
        cudaStreamCreate(&streams0[i]);
        cudaStreamCreate(&streams1[i]);
    }
    ncclComm_t comms[2];
    int devs[2] = {0, 1};
    ncclCommInitAll(comms, 2, devs);
    const int num_runs = 100;
    float total_ms[2] = {0.0f, 0.0f};
    for (int run = 0; run < num_runs; ++run) {
        float ms[2] = {0.0f, 0.0f};
        std::thread threads[2];
        for (int rank = 0; rank < 2; ++rank) {
            threads[rank] = std::thread([rank, &comms, &sendbuffs, &recvbuffs, &streams0, &streams1, size, &ms]() {
                cudaSetDevice(rank);
                cudaEvent_t start0, stop0, start1, stop1;
                cudaEventCreate(&start0);
                cudaEventCreate(&stop0);
                cudaEventCreate(&start1);
                cudaEventCreate(&stop1);
                cudaEventRecord(start0, streams0[rank]);
                cudaEventRecord(start1, streams1[rank]);
                // Group the send and recv so a single thread can issue both
                // without deadlocking against the peer rank
                ncclGroupStart();
                ncclSend(sendbuffs[rank], size / sizeof(float), ncclFloat, 1 - rank, comms[rank], streams0[rank]);
                ncclRecv(recvbuffs[rank], size / sizeof(float), ncclFloat, 1 - rank, comms[rank], streams1[rank]);
                ncclGroupEnd();
                cudaEventRecord(stop0, streams0[rank]);
                cudaEventRecord(stop1, streams1[rank]);
                cudaStreamSynchronize(streams0[rank]);
                cudaStreamSynchronize(streams1[rank]);
                float ms0 = 0.0f, ms1 = 0.0f;
                cudaEventElapsedTime(&ms0, start0, stop0);
                cudaEventElapsedTime(&ms1, start1, stop1);
                ms[rank] = std::max(ms0, ms1); // the slower of send/recv bounds the run
                cudaEventDestroy(start0);
                cudaEventDestroy(stop0);
                cudaEventDestroy(start1);
                cudaEventDestroy(stop1);
            });
        }
        threads[0].join();
        threads[1].join();
        total_ms[0] += ms[0];
        total_ms[1] += ms[1];
    }
    for (int i = 0; i < 2; ++i) {
        float avg_ms = total_ms[i] / num_runs;
        float bandwidth = 2.0f * (float)size / (avg_ms * 1e6f); // GB/s (bi-directional)
        printf("NCCL Send/Recv Bi-directional Bandwidth GPU %d: %.2f GB/s (avg over %d runs)\n", i, bandwidth, num_runs);
        cudaFree(sendbuffs[i]);
        cudaFree(recvbuffs[i]);
        cudaStreamDestroy(streams0[i]);
        cudaStreamDestroy(streams1[i]);
        ncclCommDestroy(comms[i]);
    }
}
```
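Since both communicators live in a single process, the two worker threads could equally be replaced by one thread that aggregates all four operations in a single group, which is the pattern NCCL documents for one thread driving multiple communicators. A sketch under that assumption, reusing the buffers and comms from above with one stream per rank:

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

// Single-threaded variant: one group aggregates both ranks' send/recv pairs,
// which NCCL requires when one thread drives multiple communicators.
void exchange_single_thread(void * sendbuffs[2], void * recvbuffs[2],
                            ncclComm_t comms[2], cudaStream_t streams[2],
                            size_t count) {
    ncclGroupStart();
    for (int rank = 0; rank < 2; ++rank) {
        ncclSend(sendbuffs[rank], count, ncclFloat, 1 - rank, comms[rank], streams[rank]);
        ncclRecv(recvbuffs[rank], count, ncclFloat, 1 - rank, comms[rank], streams[rank]);
    }
    ncclGroupEnd();
    for (int rank = 0; rank < 2; ++rank) {
        cudaSetDevice(rank);
        cudaStreamSynchronize(streams[rank]);
    }
}
```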
