Skip to content

Commit 484d5a0

Browse files
authored
Merge pull request #6 from kpouget/remoting
Remoting
2 parents 17dd28c + 4fa0b0a commit 484d5a0

34 files changed

+496
-140
lines changed

build.backend.sh

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,22 @@ rm -f READY_backend FAILED_backend
44
echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc -
55
export LD_PRELOAD=/tmp/isatty.so
66

7-
cmake --build ../build.remoting-backend --parallel 8 --target llama-run "$@"
7+
# Select the build flavor: any non-empty PERF_MODE picks the "-prod" build
# tree (and announces it), otherwise the default build tree is used.
FLAVOR=""
if [[ -n "${PERF_MODE:-}" ]]; then
    FLAVOR="-prod"
    cat <<EOF
###
### Building the prod flavor
###
EOF
fi

# $WHAT is intentionally left unquoted below so that it splits into one
# argument per target. cmake must stay the last command of this section:
# the script inspects $? right after it.
WHAT="llama-run llama-bench"
cmake --build "../build.remoting-backend${FLAVOR}" --parallel 8 --target $WHAT "$@"
823

924
if [[ $? == 0 ]]; then
1025
touch READY_backend

examples/run/run.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,36 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st
965965
response += piece;
966966
}
967967

968+
static long long timer_start = 0;
969+
static long long timer_total = 0;
970+
static long long timer_count = 0;
971+
972+
static inline void start_timer(void) {
973+
struct timespec ts;
974+
clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time
975+
timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
976+
}
977+
978+
static inline void stop_timer(void) {
979+
struct timespec ts;
980+
clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time
981+
long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
982+
983+
timer_total += (timer_end - timer_start);
984+
timer_count += 1;
985+
}
986+
987+
// Prints the statistics accumulated by start_timer()/stop_timer():
// total time (ms), number of timed invocations, average latency per
// invocation (ITL, whole milliseconds) and throughput (invocations/second),
// followed by the total time in whole seconds.
// The function takes no argument and returns nothing so that it can be
// registered with atexit().
static void show_timer(void) {
    if (timer_count <= 0) {
        // No stop_timer() call was recorded; the averages below would
        // divide by zero.
        printe("INFO: generate: no timed invokations\n");
        return;
    }

    const long long total_ms = timer_total / 1000000;
    const long long itl_ms   = total_ms / timer_count;

    // Throughput is derived from the nanosecond total rather than from the
    // truncated millisecond latency, which collapses to 0 (and the speed to
    // inf) as soon as one invocation takes less than 1 ms.
    const double speed = timer_total > 0
        ? (double) timer_count * 1e9 / (double) timer_total
        : 0.0;

    printe("INFO: generate: [%7lld] ms for %lld invokations | ITL %lldms | throughput = %.2f t/s\n", total_ms, timer_count, itl_ms, speed);

    printe("INFO: generate: [%7lld] s\n", total_ms / 1000);
}
996+
997+
968998
// helper function to evaluate a prompt and generate a response
969999
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
9701000
const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
@@ -974,10 +1004,22 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
9741004
return 1;
9751005
}
9761006

1007+
int cr = atexit(show_timer);
1008+
assert(cr == 0);
1009+
9771010
// prepare a batch for the prompt
9781011
llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
9791012
llama_token new_token_id;
1013+
1014+
int count = 0;
9801015
while (true) {
1016+
#if 0
1017+
if (count > 25) {
1018+
printe("WARNING: stopping after %d tokens", count);
1019+
break;
1020+
}
1021+
#endif
1022+
start_timer();
9811023
check_context_size(llama_data.context, batch);
9821024
if (llama_decode(llama_data.context.get(), batch)) {
9831025
printe("failed to decode\n");
@@ -999,6 +1041,8 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
9991041

10001042
// prepare the next batch with the sampled token
10011043
batch = llama_batch_get_one(&new_token_id, 1);
1044+
stop_timer();
1045+
count += 1;
10021046
}
10031047

10041048
printf(LOG_COL_DEFAULT);
Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
#include "shared/apir_backend.h"
22

3-
static inline apir_buffer_handle_t
3+
#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name)
4+
5+
// Converts a ggml backend buffer into its APIR host handle.
static inline apir_buffer_host_handle_t
ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
    // in the backend, the host handle is simply the value of the buffer pointer
    apir_buffer_host_handle_t host_handle = (apir_buffer_host_handle_t) buffer;

    return host_handle;
}
10+
11+
// Converts a ggml backend buffer type into its APIR host handle.
static inline apir_buffer_type_host_handle_t
ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
    // in the backend, the buffer type handle is simply the value of the
    // buffer type pointer
    apir_buffer_type_host_handle_t host_handle = (apir_buffer_type_host_handle_t) buft;

    return host_handle;
}

ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@
66
#include "ggml-backend-impl.h"
77
#include "ggml-backend.h"
88

9+
#include "shared/apir_backend.h"
10+
911
uint32_t
1012
backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
1113
UNUSED(ctx);
1214
UNUSED(enc);
1315

16+
start_timer();
17+
1418
uint32_t shmem_res_id;
1519
vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
1620

@@ -30,5 +34,7 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru
3034

3135
vn_encode_ggml_status(enc, &status);
3236

37+
stop_timer();
38+
3339
return 0;
3440
}

ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ uint32_t
1010
backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
1111
UNUSED(ctx);
1212
ggml_backend_buffer_type_t buft;
13-
buft = vn_decode_ggml_buft(dec);
13+
buft = vn_decode_ggml_buffer_type(dec);
1414

1515
const char *string = buft->iface.get_name(buft);
1616

@@ -25,7 +25,7 @@ uint32_t
2525
backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
2626
UNUSED(ctx);
2727
ggml_backend_buffer_type_t buft;
28-
buft = vn_decode_ggml_buft(dec);
28+
buft = vn_decode_ggml_buffer_type(dec);
2929

3030
size_t value = buft->iface.get_alignment(buft);
3131
vn_encode_size_t(enc, &value);
@@ -37,7 +37,7 @@ uint32_t
3737
backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
3838
UNUSED(ctx);
3939
ggml_backend_buffer_type_t buft;
40-
buft = vn_decode_ggml_buft(dec);
40+
buft = vn_decode_ggml_buffer_type(dec);
4141

4242
size_t value = buft->iface.get_max_size(buft);
4343
vn_encode_size_t(enc, &value);
@@ -49,7 +49,7 @@ uint32_t
4949
backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
5050
UNUSED(ctx);
5151
ggml_backend_buffer_type_t buft;
52-
buft = vn_decode_ggml_buft(dec);
52+
buft = vn_decode_ggml_buffer_type(dec);
5353

5454
bool is_host = buft->iface.is_host(buft);
5555
vn_encode_bool_t(enc, &is_host);
@@ -60,15 +60,32 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec
6060
uint32_t
6161
backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
6262
UNUSED(ctx);
63-
ggml_backend_buffer_type_t buft;
64-
buft = vn_decode_ggml_buft(dec);
63+
#if APIR_ALLOC_FROM_HOST_PTR
64+
uint32_t shmem_res_id;
65+
vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
6566

67+
void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
68+
if (!shmem_data) {
69+
FATAL("Couldn't get the shmem addr from virgl :/");
70+
}
71+
#else
72+
ggml_backend_buffer_type_t buft;
73+
buft = vn_decode_ggml_buffer_type(dec);
74+
#endif
6675
size_t size;
6776
vn_decode_size_t(dec, &size);
6877

69-
ggml_backend_buffer_t buffer = buft->iface.alloc_buffer(buft, size);
70-
apir_buffer_handle_t *buffer_handle = (apir_buffer_handle_t *) buffer;
71-
vn_encode_ggml_buffer_handle(enc, buffer_handle);
78+
ggml_backend_buffer_t buffer;
79+
#if APIR_ALLOC_FROM_HOST_PTR
80+
#define MAX_TENSOR_SIZE 323205120
81+
buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, MAX_TENSOR_SIZE);
82+
83+
vn_encode_ggml_buffer_type(enc, buffer->buft);
84+
#else
85+
buffer = buft->iface.alloc_buffer(buft, size);
86+
#endif
87+
88+
vn_encode_ggml_buffer(enc, buffer);
7289

7390
if (buffer) {
7491
track_backend_buffer(buffer);

ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, st
1515
uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer);
1616
vn_encode_uintptr_t(enc, &base);
1717

18-
//INFO("%s: send base %p\n", __func__, (void *) base);
19-
2018
return 0;
2119
}
2220

@@ -123,6 +121,11 @@ backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec,
123121
ggml_backend_buffer_t buffer;
124122
buffer = vn_decode_ggml_buffer(dec);
125123

124+
if (!untrack_backend_buffer(buffer)) {
    // Refuse to free a buffer that untrack_backend_buffer() does not know.
    // The format string has two conversions: __func__ feeds the leading %s
    // (it was missing, so %s consumed the pointer and %p read garbage).
    WARNING("%s: unknown buffer %p", __func__, (void *) buffer);
    return 1;
}
128+
126129
buffer->iface.free_buffer(buffer);
127130

128131
return 0;

ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,7 @@ backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *
8989

9090
ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev);
9191

92-
apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) bufft;
93-
vn_encode_apir_buffer_type_handle_t(enc, &buft_handle);
92+
vn_encode_ggml_buffer_type(enc, bufft);
9493

9594
return 0;
9695
}

ggml/src/ggml-remotingbackend/backend-dispatched.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ ggml_backend_reg_t reg = NULL;
1212
ggml_backend_dev_t dev = NULL;
1313
ggml_backend_t bck = NULL;
1414

15+
long long timer_start = 0;
16+
long long timer_total = 0;
17+
long long timer_count = 0;
18+
1519
uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) {
1620
if (reg != NULL) {
1721
FATAL("%s: already initialized :/", __func__);
@@ -35,5 +39,9 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba
3539
return APIR_BACKEND_INITIALIZE_BACKEND_FAILED;
3640
}
3741

42+
size_t free, total;
43+
dev->iface.get_memory(dev, &free, &total);
44+
WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024);
45+
3846
return APIR_BACKEND_INITIALIZE_SUCCESSS;
3947
}

ggml/src/ggml-remotingbackend/backend.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,20 @@ static void *backend_library_handle = NULL;
1818

1919
extern "C" {
2020
void apir_backend_deinit(void) {
21+
auto buffers = get_track_backend_buffers();
22+
for (const auto& buffer: buffers) {
23+
untrack_backend_buffer(buffer);
24+
buffer->iface.free_buffer(buffer);
25+
}
26+
27+
size_t free, total;
28+
dev->iface.get_memory(dev, &free, &total);
29+
WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024);
30+
31+
show_timer();
32+
33+
/* *** */
34+
2135
if (backend_library_handle) {
2236
INFO("%s: The GGML backend library was loaded. Unloading it.", __func__);
2337
dlclose(backend_library_handle);
@@ -55,6 +69,14 @@ extern "C" {
5569
return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS;
5670
}
5771

72+
INFO("#");
73+
#if APIR_ALLOC_FROM_HOST_PTR
74+
INFO("# USING ALLOC_FROM_HOST_PTR");
75+
#else
76+
INFO("# USING ALLOC_BUFFER");
77+
#endif
78+
INFO("#");
79+
5880
return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct);
5981
}
6082

@@ -81,6 +103,11 @@ extern "C" {
81103
return APIR_BACKEND_FORWARD_INDEX_INVALID;
82104
}
83105

106+
#if 0
107+
static long long count = 0;
108+
INFO("[%lld] Calling %s", count, backend_dispatch_command_name((ApirBackendCommandType) cmd_type));
109+
count += 1;
110+
#endif
84111
backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type];
85112
uint32_t ret = forward_fct(enc, dec, ctx);
86113

ggml/src/ggml-remotingbackend/shared/apir_backend.h

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#pragma once
22

3-
#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-remotingbackend.dylib"
3+
#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend-prod/bin/libggml-remotingbackend.dylib"
44
#define APIR_INITIALIZE_FCT_NAME "apir_backend_initialize"
55
#define APIR_DEINIT_FCT_NAME "apir_backend_deinit"
66
#define APIR_DISPATCH_FCT_NAME "apir_backend_dispatcher"
@@ -14,8 +14,18 @@
1414

1515
#define APIR_BACKEND_FORWARD_INDEX_INVALID 6
1616

17-
typedef uintptr_t apir_buffer_type_handle_t;
18-
typedef uintptr_t apir_buffer_handle_t;
17+
#define APIR_ALLOC_FROM_HOST_PTR 0
18+
19+
typedef uintptr_t apir_buffer_type_host_handle_t;
20+
typedef uintptr_t apir_buffer_host_handle_t;
21+
22+
typedef struct {
23+
apir_buffer_host_handle_t host_handle;
24+
#if APIR_ALLOC_FROM_HOST_PTR
25+
struct vn_renderer_shmem *shmem;
26+
apir_buffer_type_host_handle_t buft_host_handle;
27+
#endif
28+
} apir_buffer_context_t;
1929

2030
typedef uint32_t (*apir_backend_initialize_t)(void);
2131
typedef void (*apir_backend_deinit_t)(void);
@@ -72,7 +82,30 @@ struct virgl_apir_context {
7282
struct virgl_apir_callbacks iface;
7383
};
7484

75-
#define TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP 2
76-
#define TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR 2
77-
#define TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR 2
78-
#define TENSOR_MAX_DEPTH_CGRAPH_DATA 10
85+
extern long long timer_start;
86+
extern long long timer_total;
87+
extern long long timer_count;
88+
89+
static inline void start_timer(void) {
90+
struct timespec ts;
91+
clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time
92+
timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
93+
}
94+
95+
static inline void stop_timer(void) {
96+
struct timespec ts;
97+
clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time
98+
long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
99+
100+
timer_total += (timer_end - timer_start);
101+
timer_count += 1;
102+
}
103+
104+
/*
 * Logs the statistics accumulated by start_timer()/stop_timer():
 * total time (ms), number of timed invocations, average latency per
 * invocation (ITL, whole milliseconds) and throughput (invocations/second),
 * followed by the total time in whole seconds.
 */
static inline void show_timer(void) {
    if (timer_count <= 0) {
        /* No stop_timer() call was recorded; the averages below would
         * divide by zero. */
        INFO("compute_graph: no timed invokations");
        return;
    }

    const long long ms  = timer_total / 1000000;
    const long long itl = ms / timer_count;

    /* Throughput is derived from the nanosecond total rather than from the
     * truncated millisecond latency, which collapses to 0 (and the speed to
     * inf) as soon as one invocation takes less than 1 ms. */
    const double speed = timer_total > 0
        ? (double) timer_count * 1e9 / (double) timer_total
        : 0.0;

    /* All integer arguments are long long, so every conversion uses %lld. */
    INFO("compute_graph: [%9lld] ms for %lld invokations | ITL %lldms | throughput = %.2f t/s\n", ms, timer_count, itl, speed);
    INFO("compute_graph: [%9lld] s", ms / 1000);
}

0 commit comments

Comments
 (0)