Commit c41bde6
metal : add residency sets keep-alive heartbeat (#17766)
* examples : add idle
* metal : attach residency sets to queue
* idle : add link
* idle : adjust intervals
* metal : add residency sets keep-alive heartbeat
* cont : adjust default keep-alive time
1 parent 6016d0b commit c41bde6
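In short: the model's GPU allocations are tracked in Metal residency sets, and this commit adds a heartbeat that keeps re-requesting residency while the backend is in use, so the memory stays wired across short idle pauses. The new llama-idle example below measures exactly this: decode latency after increasingly long pauses. As a hedged illustration of the Metal residency-set API involved, not the actual ggml code (the helper name and surrounding setup are assumptions; requires macOS 15):

#import <Metal/Metal.h>

// illustrative only: create a residency set for one buffer and attach it to a
// command queue (error handling trimmed for brevity)
static id<MTLResidencySet> make_rset(id<MTLDevice> device, id<MTLCommandQueue> queue, id<MTLBuffer> buf) {
    MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
    desc.label = @"example residency set";

    NSError * error = nil;
    id<MTLResidencySet> rset = [device newResidencySetWithDescriptor:desc error:&error];

    [rset addAllocation:buf]; // id<MTLBuffer> conforms to MTLAllocation
    [rset commit];            // apply the pending addition
    [rset requestResidency];  // ask the OS to wire the memory

    [queue addResidencySet:rset]; // cf. "metal : attach residency sets to queue"

    return rset;
}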

7 files changed: +320 −43 lines changed


examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -20,6 +20,7 @@ else()
 
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf)
+    add_subdirectory(idle)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(parallel)

examples/idle/CMakeLists.txt

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+set(TARGET llama-idle)
+add_executable(${TARGET} idle.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/idle/README.md

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+# llama.cpp/example/idle
+
+https://github.com/ggml-org/llama.cpp/pull/17766

examples/idle/idle.cpp

Lines changed: 110 additions & 0 deletions

@@ -0,0 +1,110 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <thread>
+#include <vector>
+
+static void print_usage(int /*argc*/, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = common_model_params_to_llama(params);
+
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+
+    if (model == NULL) {
+        LOG_ERR("%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // we need just a dummy token to evaluate
+    std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx   = 512;
+    ctx_params.n_batch = 512;
+    ctx_params.no_perf = false;
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
+        return 1;
+    }
+
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+    const int n_iters = 3;
+
+    // warm-up
+    llama_decode(ctx, batch);
+    llama_memory_clear(llama_get_memory(ctx), true);
+    llama_synchronize(ctx);
+
+    for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) {
+        double t_sum_us  = 0.0;
+        double t_sum2_us = 0.0;
+
+        for (int i = 0; i < n_iters; i++) {
+            // this pause is important - it simulates "idle GPU"
+            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
+
+            const int64_t t_start_us = llama_time_us();
+
+            // this should take constant time
+            llama_decode(ctx, batch);
+            llama_synchronize(ctx);
+
+            const int64_t t_end_us = llama_time_us();
+
+            const double t_cur_us = t_end_us - t_start_us;
+
+#if 1
+            // print individual decode times
+            printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
+#endif
+
+            t_sum_us  += t_cur_us;
+            t_sum2_us += t_cur_us * t_cur_us;
+
+            llama_memory_clear(llama_get_memory(ctx), true);
+            llama_synchronize(ctx); // just in case
+        }
+
+        const double t_avg_us = t_sum_us / n_iters;
+        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));
+
+        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
+        fflush(stdout);
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+
+    return 0;
+}
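For reference, the error bar printed by the example is the unbiased sample standard deviation, computed from running sums so the loop does not need to store individual timings. A minimal sketch of the identity the t_dev_us line relies on (illustrative only; the helper name is made up):

// s^2 = sum((x_i - avg)^2) / (n - 1)          (definition)
//     = (sum(x_i^2) - n * avg^2) / (n - 1)    (expanded running-sum form)
//
// the expanded form is what idle.cpp evaluates from t_sum_us and t_sum2_us
#include <math.h>

static double stddev_from_sums(double sum, double sum2, int n) {
    const double avg = sum / n;
    return sqrt((sum2 / (n - 1)) - (avg * avg * n) / (n - 1));
}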

ggml/src/ggml-metal/ggml-metal-context.m

Lines changed: 26 additions & 16 deletions

@@ -24,9 +24,6 @@
 };
 
 struct ggml_metal {
-    id<MTLDevice>       device;
-    id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]
-
     ggml_metal_device_t  dev;
     ggml_metal_library_t lib;
 
@@ -91,15 +88,15 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
     // init context
     ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));
 
-    res->device = ggml_metal_device_get_obj(dev);
+    id<MTLDevice> device = ggml_metal_device_get_obj(dev);
 
-    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]);
+    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // TODO: would it be better to have one queue for the backend and one queue for the device?
     //       the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
     //res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
-    res->queue = ggml_metal_device_get_queue(dev);
-    if (res->queue == nil) {
+    id<MTLCommandQueue> queue = ggml_metal_device_get_queue(dev);
+    if (queue == nil) {
         GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
         return NULL;
     }
@@ -274,7 +271,8 @@ static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_te
 void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     @autoreleasepool {
         // wrap the source data into a Metal buffer
-        id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        id<MTLBuffer> buf_src = [device newBufferWithBytes:data
                                                          length:size
                                                         options:MTLResourceStorageModeShared];
@@ -289,7 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
 
         // queue the copy operation into the queue of the Metal context
        // this will be queued at the end, after any currently ongoing GPU operations
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
         [encoder copyFromBuffer:buf_src
@@ -315,7 +314,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
 
 void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     @autoreleasepool {
-        id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
                                                               length:size
                                                              options:MTLResourceStorageModeShared
                                                           deallocator:nil];
@@ -331,7 +331,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
 
         // queue the copy operation into the queue of the Metal context
         // this will be queued at the end, after any currently ongoing GPU operations
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
         [encoder copyFromBuffer:bid_src.metal
@@ -362,6 +363,9 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
     // number of threads in addition to the main thread
     const int n_cb = ctx->n_cb;
 
+    // keep the memory wired
+    ggml_metal_device_rsets_keep_alive(ctx->dev);
+
     // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
     // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
     // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
@@ -389,7 +393,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
 
     if (!ctx->capture_started) {
         // create capture scope
-        ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
 
         MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
         descriptor.captureObject = ctx->capture_scope;
@@ -406,10 +411,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
         }
     }
 
+    // short-hand
+    id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+
     // the main thread commits the first few commands immediately
     // cmd_buf[n_cb]
     {
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
         [cmd_buf retain];
 
         if (ctx->cmd_bufs[n_cb].obj) {
@@ -428,7 +436,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
     // prepare the rest of the command buffers asynchronously (optional)
     // cmd_buf[0.. n_cb)
     for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
         [cmd_buf retain];
 
         if (ctx->cmd_bufs[cb_idx].obj) {
@@ -589,9 +597,11 @@ void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_c
 }
 
 bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
-    GGML_ASSERT(ctx->device != nil);
+    GGML_ASSERT(ctx->dev != nil);
+
+    id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
 
-    return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+    return [device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
 
 void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
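The ggml_metal_device_rsets_keep_alive() call in ggml_metal_graph_compute above is the heartbeat trigger: every graph compute pings the device, and the device keeps the residency sets wired until it has been idle for longer than the keep-alive time. The implementation (presumably in ggml-metal-device.m, the seventh changed file, not shown in this excerpt) is not part of this diff; the sketch below is only an illustration of the idea, with assumed names, fields, and residency semantics, built on the real MTLResidencySet requestResidency call:

#import <Metal/Metal.h>

#include <stdatomic.h>

// hypothetical container - the real ggml_metal_rsets layout is not in this diff
typedef struct {
    NSMutableArray<id<MTLResidencySet>> * sets; // registered sets (non-owning)
    _Atomic int64_t t_last_us;                  // time of the last graph compute
} rsets_impl;

// called from every graph compute (cf. ggml_metal_device_rsets_keep_alive)
static void rsets_keep_alive(rsets_impl * r, int64_t t_now_us) {
    atomic_store(&r->t_last_us, t_now_us);
}

// called at a fixed interval by a background heartbeat thread: as long as the
// last compute falls within the keep-alive window, re-request residency so
// the OS does not unwire the GPU memory during short idle pauses
// (a real implementation would also have to balance request/end calls)
static void rsets_heartbeat_tick(rsets_impl * r, int64_t t_now_us, int64_t t_keep_alive_us) {
    if (t_now_us - atomic_load(&r->t_last_us) > t_keep_alive_us) {
        return; // idle for too long - let residency lapse
    }

    for (id<MTLResidencySet> rs in r->sets) {
        [rs requestResidency];
    }
}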

ggml/src/ggml-metal/ggml-metal-device.h

Lines changed: 15 additions & 0 deletions

@@ -186,6 +186,16 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_att
         int32_t dv,
         int32_t nwg);
 
+// MTLResidencySet wrapper
+
+typedef void * ggml_metal_rset_t;
+
+// a collection of residency sets (non-owning)
+typedef struct ggml_metal_rsets * ggml_metal_rsets_t;
+
+ggml_metal_rsets_t ggml_metal_rsets_init(void);
+void               ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
+
 //
 // device
 //
@@ -219,6 +229,11 @@ void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQue
 
 ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev);
 
+void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset);
+void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset);
+
+void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);
+
 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
 bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);