Commit 94fb1a5

Merge pull request #7 from kpouget/remoting
Remoting
2 parents 484d5a0 + 5b5ffec, commit 94fb1a5

37 files changed: +2209 −199 lines

build.backend.sh
Lines changed: 2 additions & 0 deletions

@@ -10,6 +10,8 @@ else
   FLAVOR=""
 fi
 
+export SDKROOT=$(xcrun --sdk macosx --show-sdk-path)
+
 if [[ "$FLAVOR" == "-prod" ]]; then
   cat <<EOF
 ###

examples/run/run.cpp
Lines changed: 4 additions & 14 deletions

@@ -985,13 +985,11 @@ static inline void stop_timer(void) {
 }
 
 static void show_timer(void) {
-    //printe("[%15lld] ns\n", timer_total);
-    long long ms = timer_total/1000000;
-    long long itl = ms/timer_count;
-    float speed = 1/((float)itl) * 1000;
-    printe("INFO: generate: [%7lld] ms for %lld invokations | ITL %lldms | throughput = %.2f t/s\n", timer_total/1000000, timer_count, itl, speed);
+    double ms = timer_total/1000000;
+    double itl = ms/timer_count;
+    double speed = 1/itl * 1000;
 
-    printe("INFO: generate: [%7lld] s\n", timer_total/1000000/1000);
+    printe("LLAMA generate [%9.0f] ms for %4lld invocations | ITL %2.2f ms | throughput = %4.2f t/s\n", ms, timer_count, itl, speed);
 }
 
 
@@ -1011,14 +1009,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
     llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
     llama_token new_token_id;
 
-    int count = 0;
     while (true) {
-#if 0
-        if (count > 25) {
-            printe("WARNING: stopping after %d tokens", count);
-            break;
-        }
-#endif
         start_timer();
         check_context_size(llama_data.context, batch);
         if (llama_decode(llama_data.context.get(), batch)) {
@@ -1042,7 +1033,6 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
         // prepare the next batch with the sampled token
         batch = llama_batch_get_one(&new_token_id, 1);
         stop_timer();
-        count += 1;
     }
 
     printf(LOG_COL_DEFAULT);
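
The metrics printed here are: total decode time in milliseconds, the number of llama_decode() calls wrapped by start_timer()/stop_timer(), the mean inter-token latency (ITL, ms per token), and the throughput in tokens per second. A minimal sketch of the arithmetic, with explicit floating-point division (an illustrative helper, not part of the patch):

    #include <stdio.h>

    /* Illustrative only: mirrors the math behind the "LLAMA generate" line.
     * total_ns is the accumulated decode time in nanoseconds, count the
     * number of timed llama_decode() invocations. */
    static void report_generate_stats(long long total_ns, long long count) {
        if (count == 0) {
            return;                              /* nothing was measured */
        }
        double ms    = (double) total_ns / 1e6;  /* total time, ms */
        double itl   = ms / (double) count;      /* mean inter-token latency, ms */
        double speed = 1000.0 / itl;             /* tokens per second */
        printf("generate: %.0f ms for %lld invocations | ITL %.2f ms | %.2f t/s\n",
               ms, count, itl, speed);
    }

For example, 4000 ms spent over 100 decoded tokens gives an ITL of 40 ms and a throughput of 25 t/s.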

ggml/src/ggml-metal/ggml-metal.m
Lines changed: 46 additions & 0 deletions

@@ -4485,9 +4485,53 @@ static void ggml_metal_encode_node(
     }
 }
 
+long long timer_start;
+long long timer_total;
+long long timer_count;
+
+static inline void start_timer(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time
+    timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+static inline void stop_timer(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time
+    long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+
+    timer_total += (timer_end - timer_start);
+    timer_count += 1;
+}
+
+static void show_timer(void) {
+    double ms = timer_total/1000000;
+    double itl = ms/timer_count;
+    double speed = 1/itl * 1000;
+
+    printf("METAL compute_graph: [%9.0f] ms for %lld invokations | ITL %.2f ms | throughput = %.2f t/s\n",ms, timer_count, itl, speed);
+
+    timer_start = 0;
+    timer_total = 1; // to avoid re-registering
+    timer_count = 0;
+}
+
+static void show_timer_signal(int sig) {
+    GGML_UNUSED(sig);
+    show_timer();
+}
+
 static enum ggml_status ggml_metal_graph_compute(
         ggml_backend_t backend,
         struct ggml_cgraph * gf) {
+
+    if (timer_total == 0) {
+        signal(SIGUSR1, show_timer_signal); // kill -USR1 $(cat /tmp/krunkit.pid)
+        atexit(show_timer);
+    }
+
+    start_timer();
+
     struct ggml_backend_metal_context * ctx = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
@@ -4615,6 +4659,8 @@ static enum ggml_status ggml_metal_graph_compute(
         }
     }
 
+    stop_timer();
+
     return GGML_STATUS_SUCCESS;
 }
 
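
The handlers are registered lazily on the first ggml_metal_graph_compute() call, using timer_total == 0 as the "not yet registered" sentinel; show_timer() resets timer_total to 1 so that a mid-run dump (kill -USR1 on the krunkit process) does not trigger a second registration. A self-contained sketch of this one-shot registration pattern, with an explicit guard in place of the sentinel (illustrative, not the patch itself):

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Sketch of the one-shot registration used by the patch, which reuses
     * timer_total == 0 as the guard below and resets it to 1 in show_timer(). */
    static int handlers_registered = 0;
    static long long timer_count;

    static void dump_stats(void) {
        /* NB: printf is not async-signal-safe; acceptable for ad-hoc profiling. */
        printf("compute_graph: %lld invocations so far\n", timer_count);
    }

    static void dump_stats_signal(int sig) {
        (void) sig;
        dump_stats();
    }

    static void maybe_register_handlers(void) {
        if (!handlers_registered) {
            handlers_registered = 1;
            signal(SIGUSR1, dump_stats_signal); /* dump on: kill -USR1 <pid> */
            atexit(dump_stats);                 /* and once more at process exit */
        }
    }

Each graph compute would then call maybe_register_handlers() once before starting its timer.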

ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp
Lines changed: 4 additions & 2 deletions

@@ -8,12 +8,14 @@
 
 #include "shared/apir_backend.h"
 
+struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"};
+
 uint32_t
 backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
   UNUSED(ctx);
   UNUSED(enc);
 
-  start_timer();
+  start_timer(&graph_compute_timer);
 
   uint32_t shmem_res_id;
   vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
@@ -34,7 +36,7 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru
 
   vn_encode_ggml_status(enc, &status);
 
-  stop_timer();
+  stop_timer(&graph_compute_timer);
 
   return 0;
 }
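
start_timer(), stop_timer() and show_timer() now take a struct timer_data, so graph compute, set_tensor and get_tensor are measured independently. The struct and helpers live in a shared header that is not part of this diff; the sketch below is inferred from the {0, 0, 0, "compute_timer"} initializers and the call sites, so the field names are assumptions:

    #include <stdio.h>
    #include <time.h>

    /* Assumed layout, matching the {0, 0, 0, "name"} initializers seen in the
     * patch; the real definition is in a shared header not shown here. */
    struct timer_data {
        long long start;   /* timestamp of the last start_timer() call, ns */
        long long total;   /* accumulated elapsed time, ns */
        long long count;   /* number of start/stop pairs */
        const char *name;  /* label used when the stats are printed */
    };

    static long long now_ns(void) {
        struct timespec ts;
        clock_gettime(CLOCK_REALTIME, &ts);
        return (long long) ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    static inline void start_timer(struct timer_data *t) { t->start = now_ns(); }

    static inline void stop_timer(struct timer_data *t) {
        t->total += now_ns() - t->start;
        t->count += 1;
    }

    static void show_timer(struct timer_data *t) {
        double ms = (double) t->total / 1e6;
        printf("%s: %.0f ms over %lld calls\n", t->name, ms, t->count);
    }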

ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
Lines changed: 1 addition & 15 deletions

@@ -60,30 +60,16 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec
 uint32_t
 backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
   UNUSED(ctx);
-#if APIR_ALLOC_FROM_HOST_PTR
-  uint32_t shmem_res_id;
-  vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
 
-  void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
-  if (!shmem_data) {
-    FATAL("Couldn't get the shmem addr from virgl :/");
-  }
-#else
   ggml_backend_buffer_type_t buft;
   buft = vn_decode_ggml_buffer_type(dec);
-#endif
+
   size_t size;
   vn_decode_size_t(dec, &size);
 
   ggml_backend_buffer_t buffer;
-#if APIR_ALLOC_FROM_HOST_PTR
-#define MAX_TENSOR_SIZE 323205120
-  buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, MAX_TENSOR_SIZE);
 
-  vn_encode_ggml_buffer_type(enc, buffer->buft);
-#else
   buffer = buft->iface.alloc_buffer(buft, size);
-#endif
 
   vn_encode_ggml_buffer(enc, buffer);
 
ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp
Lines changed: 11 additions & 0 deletions

@@ -6,6 +6,9 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 
+struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"};
+struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"};
+
 uint32_t
 backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
   UNUSED(ctx);
@@ -23,6 +26,8 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec,
   UNUSED(ctx);
   UNUSED(enc);
 
+  start_timer(&set_tensor_timer);
+
   ggml_backend_buffer_t buffer;
   buffer = vn_decode_ggml_buffer(dec);
 
@@ -60,6 +65,8 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec,
 
   buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size);
 
+  stop_timer(&set_tensor_timer);
+
   return 0;
 }
 
@@ -68,6 +75,8 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec,
   UNUSED(ctx);
   UNUSED(enc);
 
+  start_timer(&get_tensor_timer);
+
   ggml_backend_buffer_t buffer;
   buffer = vn_decode_ggml_buffer(dec);
 
@@ -94,6 +103,8 @@
   UNUSED(tensor);
   buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size);
 
+  stop_timer(&get_tensor_timer);
+
   return 0;
 }
 
ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp
Lines changed: 31 additions & 0 deletions

@@ -109,3 +109,34 @@ backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, s
 
   return 0;
 }
+
+uint32_t
+backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(dec);
+
+  uint32_t shmem_res_id;
+  vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+  void *shmem_ptr = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
+  if (!shmem_ptr) {
+    FATAL("Couldn't get the shmem addr from virgl :/");
+  }
+
+  size_t size;
+  vn_decode_size_t(dec, &size);
+  size_t max_tensor_size;
+  vn_decode_size_t(dec, &max_tensor_size);
+
+  ggml_backend_buffer_t buffer;
+  buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size);
+
+  vn_encode_ggml_buffer(enc, buffer);
+  vn_encode_ggml_buffer_type(enc, buffer->buft);
+
+  if (buffer) {
+    track_backend_buffer(buffer);
+  }
+
+  return 0;
+}

ggml/src/ggml-remotingbackend/backend-dispatched.cpp
Lines changed: 2 additions & 2 deletions

@@ -31,9 +31,9 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba
     dev = reg->iface.get_device(reg, 0);
   }
 
-  ggml_backend_t (* ggml_backend_fct)(void) = (ggml_backend_t (*)()) ggml_backend_init_fct_p;
+  ggml_backend_t (* ggml_backend_fct)(int) = (ggml_backend_t (*)(int)) ggml_backend_init_fct_p;
 
-  bck = ggml_backend_fct();
+  bck = ggml_backend_fct(0);
   if (!bck) {
     ERROR("%s: backend initialization failed :/", __func__);
     return APIR_BACKEND_INITIALIZE_BACKEND_FAILED;
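
The init entry point resolved from the backend library is now treated as taking an int argument and is called with 0; only the type through which the dlsym() result is invoked changes. A small sketch of that cast idiom, with a stand-in handle type (the meaning of the int is defined by the loaded backend and is not shown in this commit):

    #include <stdio.h>

    /* Stand-in for ggml_backend_t; illustrative only. */
    typedef void *backend_handle_t;

    /* Cast a void * obtained from dlsym() to the expected function-pointer
     * type before calling it, as backend_dispatch_initialize() does. */
    static backend_handle_t call_backend_init(void *init_fct_p) {
        if (!init_fct_p) {
            fprintf(stderr, "init symbol not resolved\n");
            return NULL;
        }
        backend_handle_t (*init_fct)(int) = (backend_handle_t (*)(int)) init_fct_p;
        return init_fct(0);   /* 0 is the default passed by the dispatcher */
    }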

ggml/src/ggml-remotingbackend/backend-dispatched.h
Lines changed: 3 additions & 0 deletions

@@ -27,6 +27,7 @@ uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decod
 uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
 uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
 uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
 
 /* buffer-type */
 uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
@@ -57,6 +58,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t
   case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op";
   case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return "backend_get_buffer_type";
   case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: return "backend_get_props";
+  case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: return "backend_buffer_from_ptr";
 
   /* buffer-type */
   case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name";
@@ -88,6 +90,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
   [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op,
   [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type,
   [APIR_COMMAND_TYPE_DEVICE_GET_PROPS] = backend_device_get_props,
+  [APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR] = backend_device_buffer_from_ptr,
 
   /* buffer-type */
   [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name,
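
Wiring up the new DEVICE_BUFFER_FROM_PTR command touches three places in this header: the handler prototype, the command-name switch, and the dispatch table, which is an array indexed by the APIR_COMMAND_TYPE_* enum through designated initializers. A toy version of that table pattern (enum and handler names invented for the sketch):

    #include <stdio.h>

    /* Toy version of the dispatch-table pattern used in backend-dispatched.h:
     * an enum of command ids and an array of handlers indexed by that enum. */
    typedef int (*dispatch_fn)(void);

    enum toy_command {
        TOY_COMMAND_PING,
        TOY_COMMAND_BUFFER_FROM_PTR, /* a new command slots in the same way */
        TOY_COMMAND_COUNT,
    };

    static int handle_ping(void)            { printf("ping\n"); return 0; }
    static int handle_buffer_from_ptr(void) { printf("buffer_from_ptr\n"); return 0; }

    static const dispatch_fn toy_dispatch_table[TOY_COMMAND_COUNT] = {
        [TOY_COMMAND_PING]            = handle_ping,
        [TOY_COMMAND_BUFFER_FROM_PTR] = handle_buffer_from_ptr,
    };

    int main(void) {
        return toy_dispatch_table[TOY_COMMAND_BUFFER_FROM_PTR]();
    }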

ggml/src/ggml-remotingbackend/backend.cpp
Lines changed: 33 additions & 17 deletions

@@ -10,9 +10,10 @@
 #include "shared/apir_backend.h"
 #include "shared/venus_cs.h"
 
-#define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib"
-#define GGML_BACKEND_REG_FCT_NAME "ggml_backend_metal_reg"
-#define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_metal_init"
+#define GGML_BACKEND_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH"
+#define GGML_BACKEND_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG"
+#define GGML_BACKEND_LIBRARY_INIT_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_INIT"
+
 
 static void *backend_library_handle = NULL;
 
@@ -28,8 +29,9 @@ extern "C" {
     dev->iface.get_memory(dev, &free, &total);
     WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024);
 
-    show_timer();
-
+    show_timer(&graph_compute_timer);
+    show_timer(&set_tensor_timer);
+    show_timer(&get_tensor_timer);
     /* *** */
 
     if (backend_library_handle) {
@@ -43,40 +45,54 @@ extern "C" {
  uint32_t apir_backend_initialize() {
    const char* dlsym_error;
 
-    INFO("%s: hello :wave: \\o/", __func__);
+    const char* library_name = getenv(GGML_BACKEND_LIBRARY_PATH_ENV);
+    const char* library_reg = getenv(GGML_BACKEND_LIBRARY_REG_ENV);
+    const char* library_init = getenv(GGML_BACKEND_LIBRARY_INIT_ENV);
+
+    INFO("%s: loading %s (%s|%s)", __func__, library_name, library_reg, library_init);
 
-    backend_library_handle = dlopen(GGML_BACKEND_LIBRARY_PATH, RTLD_LAZY);
+    if (!library_name) {
+      ERROR("Cannot open library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_PATH_ENV);
+
+      return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY;
+    }
+
+    backend_library_handle = dlopen(library_name, RTLD_LAZY);
 
    if (!backend_library_handle) {
      ERROR("Cannot open library: %s\n", dlerror());
 
      return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY;
    }
 
-    void *ggml_backend_reg_fct = dlsym(backend_library_handle, GGML_BACKEND_REG_FCT_NAME);
+    if (!library_reg) {
+      ERROR("Cannot register library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_REG_ENV);
+
+      return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY;
+    }
+
+    void *ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
    dlsym_error = dlerror();
    if (dlsym_error) {
      ERROR("Cannot load symbol: %s\n", dlsym_error);
 
      return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS;
    }
 
-    void *ggml_backend_init_fct = dlsym(backend_library_handle, GGML_BACKEND_INIT_FCT_NAME);
+    if (!library_init) {
+      ERROR("Cannot initialize library: env var '%s' not defined\n", library_init);
+
+      return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY;
+    }
+
+    void *ggml_backend_init_fct = dlsym(backend_library_handle, library_init);
    dlsym_error = dlerror();
    if (dlsym_error) {
      ERROR("Cannot load symbol: %s\n", dlsym_error);
 
      return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS;
    }
 
-    INFO("#");
-#if APIR_ALLOC_FROM_HOST_PTR
-    INFO("# USING ALLOC_FROM_HOST_PTR");
-#else
-    INFO("# USING ALLOC_BUFFER");
-#endif
-    INFO("#");
-
    return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct);
  }
 
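
With the hardcoded library path and symbol names removed, the loaded backend is now chosen entirely through the three environment variables. A sketch of plausible values, based on the defines this commit deletes (the .dylib path is machine-specific); setenv() is used only to keep the example in C, in practice the variables would simply be exported in the backend's environment:

    #include <stdlib.h>

    /* Example values only: the symbol names match the defines removed by this
     * commit; adjust the library path to your own Metal build of llama.cpp. */
    static void configure_apir_backend(void) {
        setenv("APIR_LLAMA_CPP_GGML_LIBRARY_PATH",
               "/path/to/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib", 1);
        setenv("APIR_LLAMA_CPP_GGML_LIBRARY_REG",  "ggml_backend_metal_reg",  1);
        setenv("APIR_LLAMA_CPP_GGML_LIBRARY_INIT", "ggml_backend_metal_init", 1);
    }

apir_backend_initialize() now returns APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY when any of the three variables is missing, instead of silently loading a hardcoded path.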
