Commit 438a077

WIP: ggml-cuda: Add bf16 cuda support to fattn (Flash Attention)

1 parent 25ff6f7

56 files changed: +1,828 −76 lines

Note: this is a large commit, so some of the 56 changed files are hidden by default and not shown below.

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -34,6 +34,7 @@ else()
     add_subdirectory(gen-docs)
     add_subdirectory(training)
     add_subdirectory(diffusion)
+    add_subdirectory(sweep-bench)
     if (NOT GGML_BACKEND_DL)
         add_subdirectory(convert-llama2c-to-ggml)
         # these examples use the backends directly and cannot be built with dynamic loading
```
examples/cuda_p2p_bench.cpp

Whitespace-only changes.

examples/cuda_p2p_bench.h

Whitespace-only changes.

examples/sweep-bench/CMakeLists.txt

Lines changed: 28 additions & 0 deletions

```cmake
cmake_minimum_required(VERSION 3.10)
project(llama_sweep_bench)

set(TARGET llama-sweep-bench)

find_package(CUDAToolkit REQUIRED)

add_executable(${TARGET}
    sweep-bench.cpp
    cuda_p2p_bench.cpp
    nccl_allreduce_bench.cpp
    nccl_sendrecv_bench.cpp
)

# Inherit CUDA settings from parent; no need to enable_language(CUDA) or set includes

install(TARGETS ${TARGET} RUNTIME DESTINATION bin)

# Locate NCCL once, up front, and link it via the found path rather than a bare
# `nccl` that depends on the linker search path.
find_library(NCCL_LIBRARY nccl REQUIRED)

if(TARGET CUDA::cudart_static)
    target_link_libraries(${TARGET} PRIVATE common llama CUDA::cudart_static ${NCCL_LIBRARY})
elseif(DEFINED CUDA_CUDART_LIBRARY)
    target_link_libraries(${TARGET} PRIVATE common llama ${CUDA_CUDART_LIBRARY} ${NCCL_LIBRARY})
else()
    message(FATAL_ERROR "Could not find CUDA runtime library target or variable. Please ensure CUDA is installed and CMake can find cudart_static or CUDA_CUDART_LIBRARY.")
endif()
```

examples/sweep-bench/README.md

Lines changed: 48 additions & 0 deletions

# llama.cpp/examples/sweep-bench

Benchmark the prompt processing and token generation performance of `llama.cpp`
by doing a sweep over the whole context size and gathering performance metrics
in each ubatch-sized window. Only a single token sequence is used.

The benchmark steps are, for each ubatch-sized window in the context (a timing
sketch follows the list):

1. generate ubatch/4 tokens (not the whole window, to save some time)
2. measure generation performance
3. prepare a ubatch-sized batch of random tokens
4. process the prepared batch
5. measure prompt processing performance

The purpose of the benchmark is to visualize how the performance changes with
the context size, without averaging the metric values over the whole context.
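A minimal, self-contained sketch of the per-window timing bookkeeping described above; `generate_token` and `process_batch` are hypothetical stand-ins for the real llama.cpp decode calls, and `n_ctx`/`n_ubatch` mirror the `-c`/`-ub` flags:

```cpp
#include <chrono>
#include <cstdio>

// Hypothetical stand-ins: in the real tool these wrap llama.cpp decode calls.
static void generate_token()         { /* decode a single token here */ }
static void process_batch(int /*n*/) { /* decode one ubatch of prompt tokens here */ }

int main() {
    const int n_ctx    = 8704; // total context to sweep (-c)
    const int n_ubatch = 512;  // window size (-ub)

    using clk = std::chrono::steady_clock;

    for (int n_kv = 0; n_kv + n_ubatch <= n_ctx; n_kv += n_ubatch) {
        // Steps 1-2: time the generation of ubatch/4 tokens, one at a time
        auto t0 = clk::now();
        for (int k = 0; k < n_ubatch / 4; ++k) generate_token();
        double t_tg = std::chrono::duration<double>(clk::now() - t0).count();

        // Steps 3-5: time the processing of one ubatch of random prompt tokens
        t0 = clk::now();
        process_batch(n_ubatch);
        double t_pp = std::chrono::duration<double>(clk::now() - t0).count();

        // With real decode calls wired in, these are the S_TG and S_PP columns
        printf("N_KV=%5d  S_TG=%.2f t/s  S_PP=%.2f t/s\n",
               n_kv, (n_ubatch / 4) / t_tg, n_ubatch / t_pp);
    }
    return 0;
}
```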
## Usage

```bash
./llama-sweep-bench -c 8704 -ub 512 -m models/Meta-Llama-3.2-3B-Instruct-Q8_0.gguf
```
## Sample results

- `PP` - prompt tokens per ubatch
- `TG` - generated tokens per ubatch
- `N_KV` - current KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
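For example, from the first row of the table below: `S_PP = PP / T_PP = 1024 / 375.321 ≈ 2.73 t/s` and `S_TG = TG / T_TG = 256 / 94.977 ≈ 2.70 t/s` (here `B = 1`, since only a single sequence is used).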
| PP   | TG  | N_KV | T_PP s  | S_PP t/s | T_TG s  | S_TG t/s |
|------|-----|------|---------|----------|---------|----------|
| 1024 | 256 | 0    | 375.321 | 2.73     | 94.977  | 2.70     |
| 1024 | 256 | 1024 | 416.327 | 2.46     | 113.177 | 2.26     |

### JSONL output

Pass `--batched-bench-output-jsonl` to output JSONL instead of Markdown, à la

```json lines
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 1024, "flash_attn": 1, "n_gpu_layers": 99, "n_threads": 48, "n_threads_batch": 48, "pp": 1024, "tg": 256, "n_kv": 0, "t_pp": 375.321000, "speed_pp": 2.730000, "t_tg": 94.977000, "speed_tg": 2.700000 }
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 1024, "flash_attn": 1, "n_gpu_layers": 99, "n_threads": 48, "n_threads_batch": 48, "pp": 1024, "tg": 256, "n_kv": 1024, "t_pp": 416.327000, "speed_pp": 2.460000, "t_tg": 113.177000, "speed_tg": 2.260000 }
```
examples/sweep-bench/cuda_p2p_bench.cpp

Lines changed: 131 additions & 0 deletions

```cpp
#include "cuda_p2p_bench.h"
#include <cstdio>
#include <cuda_runtime.h>
#include <thread>
#include <algorithm>

void run_cuda_p2p_bench() {
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 2) {
        printf("Need at least 2 GPUs for CUDA P2P test.\n");
        return;
    }
    printf("CUDA P2P communication paths (PCI bus IDs):\n");
    for (int i = 0; i < num_devices; ++i) {
        cudaDeviceProp prop_i;
        cudaGetDeviceProperties(&prop_i, i);
        for (int j = 0; j < num_devices; ++j) {
            if (i == j) continue;
            cudaDeviceProp prop_j;
            cudaGetDeviceProperties(&prop_j, j);
            int can_access = 0;
            cudaDeviceCanAccessPeer(&can_access, i, j);
            printf("  GPU %d (%04x:%02x:%02x) <-> GPU %d (%04x:%02x:%02x) : P2P %s\n",
                   i, prop_i.pciDomainID, prop_i.pciBusID, prop_i.pciDeviceID,
                   j, prop_j.pciDomainID, prop_j.pciBusID, prop_j.pciDeviceID,
                   can_access ? "ENABLED" : "DISABLED");
        }
    }
    // Enable peer access; if it fails, cudaMemcpyPeerAsync silently falls back
    // to staging through host memory
    for (int i = 0; i < num_devices; ++i) {
        cudaSetDevice(i);
        for (int j = 0; j < num_devices; ++j) {
            if (i != j) cudaDeviceEnablePeerAccess(j, 0);
        }
    }
    // Uni-directional and bi-directional bandwidth test for each pair
    size_t size = 1024 * 1024 * 1024; // 1 GB
    const int num_runs = 100;
    for (int i = 0; i < num_devices; ++i) {
        for (int j = 0; j < num_devices; ++j) {
            if (i == j) continue;
            cudaSetDevice(i);
            void *src, *dst;
            cudaMalloc(&src, size);
            cudaSetDevice(j);
            cudaMalloc(&dst, size);
            cudaStream_t stream;
            cudaStreamCreate(&stream);
            // Uni-directional bandwidth test (i -> j)
            float total_ms = 0.0f;
            for (int run = 0; run < num_runs; ++run) {
                cudaEvent_t start, stop;
                cudaEventCreate(&start);
                cudaEventCreate(&stop);
                cudaEventRecord(start, stream);
                cudaMemcpyPeerAsync(dst, j, src, i, size, stream);
                cudaEventRecord(stop, stream);
                cudaStreamSynchronize(stream);
                float ms = 0.0f;
                cudaEventElapsedTime(&ms, start, stop);
                total_ms += ms;
                cudaEventDestroy(start);
                cudaEventDestroy(stop);
            }
            float avg_ms = total_ms / num_runs;
            float bandwidth = (float)size / (avg_ms * 1e6f); // GB/s
            printf("GPU %d <-> GPU %d:\n  Bandwidth GPU %d -> GPU %d: %.2f GB/s (avg over %d runs)\n",
                   i, j, i, j, bandwidth, num_runs);
            // Bi-directional bandwidth test (both directions truly in parallel)
            cudaStream_t stream0, stream1;
            cudaSetDevice(i);
            cudaStreamCreate(&stream0);
            cudaSetDevice(j);
            cudaStreamCreate(&stream1);
            float total_bi_ms = 0.0f;
            for (int run = 0; run < num_runs; ++run) {
                cudaEvent_t start0, stop0, start1, stop1;
                cudaSetDevice(i);
                cudaEventCreate(&start0);
                cudaEventCreate(&stop0);
                cudaSetDevice(j);
                cudaEventCreate(&start1);
                cudaEventCreate(&stop1);
                cudaSetDevice(i);
                cudaEventRecord(start0, stream0);
                cudaSetDevice(j);
                cudaEventRecord(start1, stream1);
                // Launch both directions in parallel
                std::thread t0([&]() {
                    cudaSetDevice(i);
                    cudaMemcpyPeerAsync(dst, j, src, i, size, stream0);
                    cudaEventRecord(stop0, stream0);
                });
                std::thread t1([&]() {
                    cudaSetDevice(j);
                    cudaMemcpyPeerAsync(src, i, dst, j, size, stream1);
                    cudaEventRecord(stop1, stream1);
                });
                t0.join();
                t1.join();
                cudaSetDevice(i);
                cudaStreamSynchronize(stream0);
                cudaSetDevice(j);
                cudaStreamSynchronize(stream1);
                float ms0 = 0.0f, ms1 = 0.0f;
                cudaEventElapsedTime(&ms0, start0, stop0);
                cudaEventElapsedTime(&ms1, start1, stop1);
                float ms = std::max(ms0, ms1); // the slower direction bounds the run
                total_bi_ms += ms;
                cudaSetDevice(i);
                cudaEventDestroy(start0);
                cudaEventDestroy(stop0);
                cudaSetDevice(j);
                cudaEventDestroy(start1);
                cudaEventDestroy(stop1);
            }
            float avg_bi_ms = total_bi_ms / num_runs;
            float bi_bandwidth = 2.0f * (float)size / (avg_bi_ms * 1e6f); // GB/s, both directions counted
            printf("  Bi-directional bandwidth GPU %d <-> GPU %d: %.2f GB/s (avg over %d runs)\n",
                   i, j, bi_bandwidth, num_runs);
            cudaStreamDestroy(stream);
            cudaSetDevice(i);
            cudaStreamDestroy(stream0);
            cudaFree(src);
            cudaSetDevice(j);
            cudaStreamDestroy(stream1);
            cudaFree(dst);
        }
    }
}
```
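None of the CUDA calls above are error-checked, so failures (e.g. an out-of-memory `cudaMalloc` of the 1 GB buffers) go unnoticed. A small wrapper macro like the following, a common pattern but not part of this commit, could guard each call:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper, not part of this commit: abort with file/line context
// when any CUDA runtime call fails.
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(1);                                                   \
        }                                                              \
    } while (0)

// Usage: CUDA_CHECK(cudaMalloc(&src, size));
```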

examples/sweep-bench/cuda_p2p_bench.h

Lines changed: 2 additions & 0 deletions

```cpp
#pragma once
void run_cuda_p2p_bench();
```
examples/sweep-bench/nccl_allreduce_bench.cpp

Lines changed: 67 additions & 0 deletions

```cpp
#include "nccl_allreduce_bench.h"
#include <cstdio>
#include <vector>
#include <thread>
#include <cuda_runtime.h>
#include <nccl.h>

void run_nccl_allreduce_bench() {
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 2) {
        printf("Need at least 2 GPUs for NCCL AllReduce test.\n");
        return;
    }
    printf("Warming up NCCL communication on all GPUs...\n");
    size_t size = 1024 * 1024 * 1024; // 1 GB per buffer
    std::vector<void*> sendbuffs(num_devices), recvbuffs(num_devices);
    std::vector<cudaStream_t> streams(num_devices);
    for (int i = 0; i < num_devices; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&sendbuffs[i], size);
        cudaMalloc(&recvbuffs[i], size);
        cudaStreamCreate(&streams[i]);
    }
    // One communicator per device, all managed by this single process
    std::vector<ncclComm_t> comms(num_devices);
    std::vector<int> devs(num_devices);
    for (int i = 0; i < num_devices; ++i) devs[i] = i;
    ncclCommInitAll(comms.data(), num_devices, devs.data());
    const int num_runs = 100;
    std::vector<float> total_ms(num_devices, 0.0f);
    for (int run = 0; run < num_runs; ++run) {
        std::vector<std::thread> threads;
        std::vector<cudaEvent_t> starts(num_devices), stops(num_devices);
        for (int i = 0; i < num_devices; ++i) {
            cudaSetDevice(i);
            cudaEventCreate(&starts[i]);
            cudaEventCreate(&stops[i]);
        }
        // One thread per rank issues its collective; all ranks must
        // participate before any of them can complete
        for (int i = 0; i < num_devices; ++i) {
            threads.emplace_back([i, &comms, &sendbuffs, &recvbuffs, &streams, &starts, &stops, size]() {
                cudaSetDevice(i);
                cudaEventRecord(starts[i], streams[i]);
                ncclAllReduce(sendbuffs[i], recvbuffs[i], size / sizeof(float), ncclFloat, ncclSum, comms[i], streams[i]);
                cudaEventRecord(stops[i], streams[i]);
                cudaStreamSynchronize(streams[i]);
            });
        }
        for (auto& t : threads) t.join();
        for (int i = 0; i < num_devices; ++i) {
            float ms = 0.0f;
            cudaEventSynchronize(stops[i]);
            cudaEventElapsedTime(&ms, starts[i], stops[i]);
            total_ms[i] += ms;
            cudaEventDestroy(starts[i]);
            cudaEventDestroy(stops[i]);
        }
    }
    for (int i = 0; i < num_devices; ++i) {
        cudaFree(sendbuffs[i]);
        cudaFree(recvbuffs[i]);
        cudaStreamDestroy(streams[i]);
        float avg_ms = total_ms[i] / num_runs;
        float bandwidth = (float)size / (avg_ms * 1e6f); // algorithmic GB/s
        printf("NCCL Bandwidth GPU %d: %.2f GB/s (avg over %d runs)\n", i, bandwidth, num_runs);
        ncclCommDestroy(comms[i]);
    }
}
```
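The printed figure is the algorithmic bandwidth: the buffer size divided by the time the collective took on that rank. For all-reduce, the nccl-tests convention additionally reports a bus bandwidth scaled by `2*(n-1)/n` to make results comparable across rank counts; a quick conversion sketch (the formula follows the nccl-tests documentation, the numbers are example values):

```cpp
#include <cstdio>

// Convert algorithmic bandwidth to bus bandwidth for all-reduce,
// following the busBw = algBw * 2*(n-1)/n formula used by nccl-tests.
int main() {
    const int   n      = 4;     // number of ranks (example value)
    const float alg_bw = 50.0f; // GB/s as printed by the bench (example value)
    const float bus_bw = alg_bw * 2.0f * (n - 1) / n;
    printf("algBw = %.1f GB/s -> busBw = %.1f GB/s with %d ranks\n", alg_bw, bus_bw, n);
    return 0;
}
```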
examples/sweep-bench/nccl_allreduce_bench.h

Lines changed: 2 additions & 0 deletions

```cpp
#pragma once
void run_nccl_allreduce_bench();
```
examples/sweep-bench/nccl_sendrecv_bench.cpp

Lines changed: 76 additions & 0 deletions

```cpp
#include "nccl_sendrecv_bench.h"
#include <cstdio>
#include <thread>
#include <cuda_runtime.h>
#include <nccl.h>
#include <algorithm> // for std::max

void run_nccl_sendrecv_bench() {
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 2) {
        printf("Need at least 2 GPUs for NCCL Send/Recv test.\n");
        return;
    }
    // Only GPUs 0 and 1 are exercised, even when more devices are present
    size_t size = 1024 * 1024 * 1024; // 1 GB per buffer
    void *sendbuffs[2], *recvbuffs[2];
    cudaStream_t streams0[2], streams1[2]; // separate send and recv streams per rank
    for (int i = 0; i < 2; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&sendbuffs[i], size);
        cudaMalloc(&recvbuffs[i], size);
        cudaStreamCreate(&streams0[i]);
        cudaStreamCreate(&streams1[i]);
    }
    ncclComm_t comms[2];
    int devs[2] = {0, 1};
    ncclCommInitAll(comms, 2, devs);
    const int num_runs = 100;
    float total_ms[2] = {0.0f, 0.0f};
    for (int run = 0; run < num_runs; ++run) {
        float ms[2] = {0.0f, 0.0f};
        std::thread threads[2];
        for (int rank = 0; rank < 2; ++rank) {
            threads[rank] = std::thread([rank, &comms, &sendbuffs, &recvbuffs, &streams0, &streams1, size, &ms]() {
                cudaSetDevice(rank);
                cudaEvent_t start0, stop0, start1, stop1;
                cudaEventCreate(&start0);
                cudaEventCreate(&stop0);
                cudaEventCreate(&start1);
                cudaEventCreate(&stop1);
                cudaEventRecord(start0, streams0[rank]);
                cudaEventRecord(start1, streams1[rank]);
                // Group the send and recv so a single thread can issue both
                // without deadlocking against the peer rank
                ncclGroupStart();
                ncclSend(sendbuffs[rank], size / sizeof(float), ncclFloat, 1 - rank, comms[rank], streams0[rank]);
                ncclRecv(recvbuffs[rank], size / sizeof(float), ncclFloat, 1 - rank, comms[rank], streams1[rank]);
                ncclGroupEnd();
                cudaEventRecord(stop0, streams0[rank]);
                cudaEventRecord(stop1, streams1[rank]);
                cudaStreamSynchronize(streams0[rank]);
                cudaStreamSynchronize(streams1[rank]);
                float ms0 = 0.0f, ms1 = 0.0f;
                cudaEventElapsedTime(&ms0, start0, stop0);
                cudaEventElapsedTime(&ms1, start1, stop1);
                ms[rank] = std::max(ms0, ms1); // the slower of send/recv bounds the run
                cudaEventDestroy(start0);
                cudaEventDestroy(stop0);
                cudaEventDestroy(start1);
                cudaEventDestroy(stop1);
            });
        }
        threads[0].join();
        threads[1].join();
        total_ms[0] += ms[0];
        total_ms[1] += ms[1];
    }
    for (int i = 0; i < 2; ++i) {
        float avg_ms = total_ms[i] / num_runs;
        float bandwidth = 2.0f * (float)size / (avg_ms * 1e6f); // GB/s (bi-directional)
        printf("NCCL Send/Recv Bi-directional Bandwidth GPU %d: %.2f GB/s (avg over %d runs)\n", i, bandwidth, num_runs);
        cudaFree(sendbuffs[i]);
        cudaFree(recvbuffs[i]);
        cudaStreamDestroy(streams0[i]);
        cudaStreamDestroy(streams1[i]);
        ncclCommDestroy(comms[i]);
    }
}
```
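Since both communicators live in a single process, the two worker threads could equally be replaced by one thread that aggregates all four operations in a single group, which is the pattern NCCL documents for one thread driving multiple communicators. A sketch under that assumption, reusing the buffers and comms from above with one stream per rank:

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

// Single-threaded variant: one group aggregates both ranks' send/recv pairs,
// which NCCL requires when one thread drives multiple communicators.
void exchange_single_thread(void * sendbuffs[2], void * recvbuffs[2],
                            ncclComm_t comms[2], cudaStream_t streams[2],
                            size_t count) {
    ncclGroupStart();
    for (int rank = 0; rank < 2; ++rank) {
        ncclSend(sendbuffs[rank], count, ncclFloat, 1 - rank, comms[rank], streams[rank]);
        ncclRecv(recvbuffs[rank], count, ncclFloat, 1 - rank, comms[rank], streams[rank]);
    }
    ncclGroupEnd();
    for (int rank = 0; rank < 2; ++rank) {
        cudaSetDevice(rank);
        cudaStreamSynchronize(streams[rank]);
    }
}
```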
