
Commit f484322

examples : add idle tool for investigating GPU idle overhead

1 parent: ab3d71f

File tree

    Makefile
    examples/CMakeLists.txt
    examples/idle/CMakeLists.txt
    examples/idle/README.md
    examples/idle/idle.cpp

5 files changed: +155 -0 lines

Makefile

Lines changed: 6 additions & 0 deletions

@@ -15,6 +15,7 @@ BUILD_TARGETS = \
     llama-gguf-hash \
     llama-gguf-split \
     llama-gritlm \
+    llama-idle \
     llama-imatrix \
     llama-infill \
     llama-llava-cli \
@@ -1282,6 +1283,11 @@ llama-infill: examples/infill/infill.cpp \
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-idle: examples/idle/idle.cpp \
+    $(OBJ_ALL)
+    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
     $(OBJ_ALL)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
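
With this rule and the new BUILD_TARGETS entry in place, the tool should build like any other Makefile target (a sketch, assuming the usual llama.cpp Makefile workflow):

    make llama-idle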

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ else()
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
     add_subdirectory(gritlm)
+    add_subdirectory(idle)
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)

examples/idle/CMakeLists.txt

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+set(TARGET llama-idle)
+add_executable(${TARGET} idle.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
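
For the CMake build, a minimal sketch of configuring, building and running the new target (the build directory and binary path are illustrative; the -m and -ngl flags come from print_usage in idle.cpp below):

    cmake -B build
    cmake --build build --target llama-idle
    ./build/bin/llama-idle -m model.gguf -ngl 99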

examples/idle/README.md

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+# llama.cpp/example/idle
+
+
examples/idle/idle.cpp

Lines changed: 140 additions & 0 deletions

@@ -0,0 +1,140 @@
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <thread>
+#include <vector>
+
+static void print_usage(int /*argc*/, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    // path to the model gguf file
+    std::string model_path;
+    // number of layers to offload to the GPU
+    int ngl = 99;
+
+    // parse command line arguments
+
+    {
+        int i = 1;
+        for (; i < argc; i++) {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        ngl = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                // prompt starts here
+                break;
+            }
+        }
+        if (model_path.empty()) {
+            print_usage(argc, argv);
+            return 1;
+        }
+    }
+
+    // initialize the model
+
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    const int n_prompt = 1;
+
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt, llama_token_bos(model));
+
+    // initialize the context
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx   = 512;
+    ctx_params.n_batch = 512;
+    ctx_params.no_perf = false;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+    const int n_iters = 10;
+
+    // warm-up
+    llama_decode(ctx, batch);
+    llama_kv_cache_clear (ctx);
+    llama_kv_cache_update(ctx);
+    llama_synchronize    (ctx);
+
+    for (int64_t t_pause_ms = 0; t_pause_ms <= 2200; t_pause_ms += 200) {
+        double t_sum_us  = 0.0;
+        double t_sum2_us = 0.0;
+
+        for (int i = 0; i < n_iters; i++) {
+            // this pause is important - it simulates "idle GPU"
+            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
+
+            const int64_t t_start_us = llama_time_us();
+
+            // this should take constant time
+            llama_decode(ctx, batch);
+            llama_synchronize(ctx);
+
+            const int64_t t_end_us = llama_time_us();
+
+            const double t_cur_us = t_end_us - t_start_us;
+
+#if 0
+            // print individual decode times
+            printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
+#endif
+
+            t_sum_us  += t_cur_us;
+            t_sum2_us += t_cur_us * t_cur_us;
+
+            llama_kv_cache_clear (ctx);
+            llama_kv_cache_update(ctx);
+            llama_synchronize    (ctx); // just in case
+        }
+
+        const double t_avg_us = t_sum_us / n_iters;
+        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));
+
+        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
+        fflush(stdout);
+    }
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    return 0;
+}
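
For each pause length, the inner loop accumulates the sum and the sum of squares of the per-iteration decode times, and the "+/-" value printed is the sample standard deviation; in the notation of the code, the t_dev_us expression above is algebraically equivalent to:

    t_dev_us = sqrt((t_sum2_us - n_iters * t_avg_us * t_avg_us) / (n_iters - 1));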
