
Commit dc50d54

examples : add idle tool for investigating GPU idle overhead
1 parent ab3d71f commit dc50d54

File tree

Makefile
examples/CMakeLists.txt
examples/idle/CMakeLists.txt
examples/idle/README.md
examples/idle/idle.cpp

5 files changed: +153 -0 lines changed

Makefile

Lines changed: 6 additions & 0 deletions
@@ -15,6 +15,7 @@ BUILD_TARGETS = \
 	llama-gguf-hash \
 	llama-gguf-split \
 	llama-gritlm \
+	llama-idle \
 	llama-imatrix \
 	llama-infill \
 	llama-llava-cli \
@@ -1282,6 +1283,11 @@ llama-infill: examples/infill/infill.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-idle: examples/idle/idle.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
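With llama-idle added to BUILD_TARGETS and the rule above in place, the tool should build the same way as the other examples, i.e. running make llama-idle from the repository root; that invocation is inferred from the Makefile rule rather than stated anywhere in the commit.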

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ else()
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
     add_subdirectory(gritlm)
+    add_subdirectory(idle)
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)

examples/idle/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
set(TARGET llama-idle)
add_executable(${TARGET} idle.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
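For CMake builds, nothing beyond registering the subdirectory (see examples/CMakeLists.txt above) is required; assuming the usual llama.cpp CMake workflow, the binary would be produced by a normal configure-and-build pass, e.g. cmake -B build followed by cmake --build build --target llama-idle. This command line is an assumption based on the target name defined here, not something spelled out in the commit.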

examples/idle/README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# llama.cpp/example/idle
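The new README contains only the title for now. For reference, the usage string printed by print_usage() in idle.cpp below is: llama-idle -m model.gguf [-ngl n_gpu_layers]. The tool loads the model, then for each pause length from 0 to 2200 ms it sleeps, times a single-token llama_decode() followed by llama_synchronize(), and reports the average, the idea being to expose any extra latency that appears after the GPU has been idle.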

examples/idle/idle.cpp

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
#include "llama.h"

#include <chrono>   // std::chrono::milliseconds
#include <cmath>    // sqrt
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>
#include <vector>

static void print_usage(int /*argc*/, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n    %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
    printf("\n");
}

int main(int argc, char ** argv) {
    // path to the model gguf file
    std::string model_path;
    // number of layers to offload to the GPU
    int ngl = 99;

    // parse command line arguments
    {
        int i = 1;
        for (; i < argc; i++) {
            if (strcmp(argv[i], "-m") == 0) {
                if (i + 1 < argc) {
                    model_path = argv[++i];
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else if (strcmp(argv[i], "-ngl") == 0) {
                if (i + 1 < argc) {
                    try {
                        ngl = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else {
                // unknown argument - stop parsing
                break;
            }
        }

        if (model_path.empty()) {
            print_usage(argc, argv);
            return 1;
        }
    }

    // initialize the model

    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    const int n_prompt = 1;

    // allocate space for the tokens and tokenize the prompt
    std::vector<llama_token> prompt_tokens(n_prompt, llama_token_bos(model));

    // initialize the context

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx   = 512;
    ctx_params.n_batch = 512;
    ctx_params.no_perf = false;

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    if (ctx == NULL) {
        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
        return 1;
    }

    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

    const int n_iters = 10;

    // warm-up
    llama_decode(ctx, batch);
    llama_kv_cache_clear (ctx);
    llama_kv_cache_update(ctx);
    llama_synchronize    (ctx);

    // measure the decode time after increasingly long idle pauses
    for (int64_t t_pause_ms = 0; t_pause_ms <= 2200; t_pause_ms += 200) {
        double t_sum_us  = 0.0;
        double t_sum2_us = 0.0;

        for (int i = 0; i < n_iters; i++) {
            // this pause is important - it simulates "idle GPU"
            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));

            const int64_t t_start_us = llama_time_us();

            // this should take constant time
            llama_decode(ctx, batch);
            llama_synchronize(ctx);

            const int64_t t_end_us = llama_time_us();

            const double t_cur_us = t_end_us - t_start_us;

#if 0
            // print individual decode times
            printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
#endif

            t_sum_us  += t_cur_us;
            t_sum2_us += t_cur_us * t_cur_us;

            llama_kv_cache_clear (ctx);
            llama_kv_cache_update(ctx);
            llama_synchronize    (ctx); // just in case
        }

        const double t_avg_us = t_sum_us / n_iters;
        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));

        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
        fflush(stdout);
    }

    llama_free(ctx);
    llama_free_model(model);

    return 0;
}
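The numbers printed for each pause setting are the sample mean and sample standard deviation of the n_iters = 10 measured decode times, accumulated via running sums. Written out, this is just a restatement of the t_avg_us and t_dev_us expressions above, not anything new in the commit:

$$
t_{\mathrm{avg}} = \frac{1}{n} \sum_{i=1}^{n} t_i,
\qquad
t_{\mathrm{dev}} = \sqrt{\frac{\sum_{i=1}^{n} t_i^2 \;-\; n \, t_{\mathrm{avg}}^2}{n - 1}}
$$

where n = n_iters and the t_i are the individual llama_decode() + llama_synchronize() times in microseconds; both values are divided by 1000 before printing so the report is in milliseconds.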
