@@ -93,6 +93,7 @@ enum rpc_cmd {
     RPC_CMD_INIT_TENSOR,
     RPC_CMD_GET_ALLOC_SIZE,
     RPC_CMD_HELLO,
+    RPC_CMD_LOAD_TENSOR,
     RPC_CMD_COUNT,
 };
 
@@ -161,6 +162,18 @@ struct rpc_msg_set_tensor_hash_rsp {
     uint8_t result;
 };
 
+struct rpc_msg_load_tensor_req {
+    uint64_t model_hash;
+    rpc_tensor tensor;
+    uint64_t file_offset;
+    uint64_t tensor_offset;
+    uint64_t size;
+};
+
+struct rpc_msg_load_tensor_rsp {
+    uint8_t result;
+};
+
 struct rpc_msg_get_tensor_req {
     rpc_tensor tensor;
     uint64_t offset;
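
The request/response structs above travel over the socket as raw bytes, so both ends must agree on their layout. Assuming they sit inside the same #pragma pack(push, 1) region as the neighbouring message structs in this file, a compile-time check along these lines (a sketch, not part of the patch) would catch accidental padding:

    // sketch: model_hash + file_offset + tensor_offset + size are 4 x uint64_t
    static_assert(sizeof(rpc_msg_load_tensor_req) ==
                  4 * sizeof(uint64_t) + sizeof(rpc_tensor),
                  "rpc_msg_load_tensor_req must be packed");
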
@@ -213,6 +226,24 @@ struct ggml_backend_rpc_buffer_context {
 
 // RPC helper functions
 
+typedef uint64_t fnv_ctx_t;
+
+static void fnv_init(fnv_ctx_t * ctx) {
+    *ctx = 0xcbf29ce484222325ULL;
+}
+
+static void fnv_update(fnv_ctx_t * ctx, const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    for (size_t i = 0; i < len; ++i) {
+        *ctx ^= data[i];
+        *ctx *= fnv_prime;
+    }
+}
+
+static void fnv_final(fnv_ctx_t * ctx, uint64_t * digest) {
+    *digest = *ctx;
+}
+
 // Computes FNV-1a hash of the data
 static uint64_t fnv_hash(const uint8_t * data, size_t len) {
     const uint64_t fnv_prime = 0x100000001b3ULL;
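
The fnv_* helpers above are a streaming variant of the one-shot fnv_hash shown in context: pushing the same bytes through fnv_update in arbitrary chunks yields the same digest, which is what lets generate_model_hash below hash a model tensor by tensor instead of requiring one contiguous buffer. A minimal self-check (a sketch, not part of the patch):

    static void fnv_selfcheck(void) {
        const uint8_t data[] = {0x01, 0x02, 0x03, 0x04, 0x05};
        fnv_ctx_t ctx;
        fnv_init(&ctx);
        fnv_update(&ctx, data, 2);                    // first chunk
        fnv_update(&ctx, data + 2, sizeof(data) - 2); // remainder
        uint64_t digest;
        fnv_final(&ctx, &digest);
        GGML_ASSERT(digest == fnv_hash(data, sizeof(data)));
    }
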
@@ -225,6 +256,87 @@ static uint64_t fnv_hash(const uint8_t * data, size_t len) {
     return hash;
 }
 
+static bool get_model_hash_from_file(const char * model_file, uint64_t * hash) {
+    // try loading the hash from model_file + '.rpc'
+    std::string rpc_file = std::string(model_file) + ".rpc";
+    // the hash file must exist, must be exactly 16 bytes and must be a valid hash written in hex
+    if (!fs::exists(rpc_file)) {
+        return false;
+    }
+    std::ifstream file(rpc_file, std::ios::binary);
+    if (!file.is_open()) {
+        return false;
+    }
+    std::string hash_str;
+    file.seekg(0, std::ios::end);
+    size_t file_size = file.tellg();
+    if (file_size != 16) {
+        return false;
+    }
+    file.seekg(0, std::ios::beg);
+    hash_str.resize(file_size);
+    file.read(&hash_str[0], file_size);
+    if ((size_t)file.gcount() != file_size) {
+        return false;
+    }
+    if (hash_str.find_first_not_of("0123456789abcdefABCDEF") != std::string::npos) {
+        return false;
+    }
+    *hash = std::stoull(hash_str, nullptr, 16);
+    return true;
+}
+
+static bool get_model_hash(const char * model_file, uint64_t * hash) {
+    // model path -> (hash_exist, hash_value)
+    static std::unordered_map<std::string, std::pair<bool, uint64_t>> model_hashes;
+    if (model_hashes.find(model_file) != model_hashes.end()) {
+        *hash = model_hashes[model_file].second;
+        return model_hashes[model_file].first;
+    }
+    if (get_model_hash_from_file(model_file, hash)) {
+        model_hashes[model_file] = {true, *hash};
+        return true;
+    }
+    model_hashes[model_file] = {false, 0};
+    return false;
+}
+
+static bool generate_model_hash(const char * model_file, uint64_t * hash) {
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx,
+    };
+    gguf_context_ptr ctx_gguf { gguf_init_from_file(model_file, params) };
+    if (!ctx_gguf) {
+        return false;
+    }
+    fnv_ctx_t fnv_ctx;
+    fnv_init(&fnv_ctx);
+    size_t data_offset = gguf_get_data_offset(ctx_gguf.get());
+    fnv_update(&fnv_ctx, (const uint8_t *)&data_offset, sizeof(data_offset));
+    const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+    for (int i = 0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+        ggml_tensor * cur = ggml_get_tensor(ctx, name);
+        auto n_bytes = ggml_nbytes(cur);
+        fnv_update(&fnv_ctx, (const uint8_t *)cur->data, n_bytes);
+    }
+    fnv_final(&fnv_ctx, hash);
+    // save the model hash to model_file + '.rpc' in hex format
+    std::string hash_file = std::string(model_file) + ".rpc";
+    std::ofstream file(hash_file, std::ios::binary);
+    if (!file.is_open()) {
+        return false;
+    }
+    file << std::hex << std::setfill('0') << std::setw(16) << *hash;
+    if (!file) {
+        return false;
+    }
+    file.close();
+    return true;
+}
+
 static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
 #ifdef _WIN32
     if (fd == INVALID_SOCKET) {
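
The sidecar written by generate_model_hash is exactly 16 hex characters with no trailing newline, which is what get_model_hash_from_file's size check requires. Note also that get_model_hash memoizes failures, so a sidecar created later in the same process is not picked up once a lookup for that path has already failed. A round-trip sketch (hypothetical path, not part of the patch):

    static void sidecar_roundtrip_check(void) {
        uint64_t h1, h2;
        if (generate_model_hash("/models/llama.gguf", &h1)) {       // writes /models/llama.gguf.rpc
            GGML_ASSERT(get_model_hash("/models/llama.gguf", &h2)); // reads the sidecar back
            GGML_ASSERT(h1 == h2);
        }
    }
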
@@ -605,6 +717,24 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     return response.result;
 }
 
+static bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    uint64_t hash;
+    if (!get_model_hash(path, &hash)) {
+        return false;
+    }
+    rpc_msg_load_tensor_req request;
+    request.model_hash = hash;
+    request.tensor = serialize_tensor(tensor);
+    request.file_offset = file_offset;
+    request.tensor_offset = tensor_offset;
+    request.size = size;
+    rpc_msg_load_tensor_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_LOAD_TENSOR, &request, sizeof(request), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.result;
+}
+
 static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
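
This client-side entry point is exported through the backend registry under the name "ggml_backend_tensor_load" (see the get_proc_address hunk at the end of this diff), so callers resolve it at runtime instead of linking against it. A hedged sketch of what a caller might look like; the typedef and helper name are illustrative, not part of the patch:

    typedef bool (*load_tensor_fn)(ggml_backend_buffer_t buffer, ggml_tensor * tensor,
                                   const char * path, size_t file_offset,
                                   size_t tensor_offset, size_t size);

    // hypothetical caller: try server-side loading; on failure the caller can
    // still fall back to streaming the bytes with ggml_backend_tensor_set()
    static bool try_remote_load(ggml_backend_reg_t reg, ggml_backend_buffer_t buf,
                                ggml_tensor * t, const char * model_path,
                                size_t file_off, size_t n) {
        auto fn = (load_tensor_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_tensor_load");
        return fn != nullptr && fn(buf, t, model_path, file_off, /*tensor_offset=*/0, n);
    }
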
@@ -854,8 +984,8 @@ void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, si
 
 class rpc_server {
 public:
-    rpc_server(ggml_backend_t backend, const char * cache_dir)
-        : backend(backend), cache_dir(cache_dir) {
+    rpc_server(ggml_backend_t backend, const std::unordered_map<uint64_t, std::string> & model_hashes, const char * cache_dir)
+        : backend(backend), cache_dir(cache_dir), model_hashes(model_hashes) {
     }
     ~rpc_server();
 
@@ -868,6 +998,7 @@ class rpc_server {
     bool buffer_clear(const rpc_msg_buffer_clear_req & request);
     bool set_tensor(const std::vector<uint8_t> & input);
     bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
+    bool load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response);
     bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
     bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
     bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
@@ -886,6 +1017,7 @@ class rpc_server {
     ggml_backend_t backend;
     const char * cache_dir;
     std::unordered_set<ggml_backend_buffer_t> buffers;
+    const std::unordered_map<uint64_t, std::string> & model_hashes;
 };
 
 void rpc_server::hello(rpc_msg_hello_rsp & response) {
@@ -1104,6 +1236,18 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
     return true;
 }
 
+static bool read_model_data(const char * path, size_t file_offset, size_t size, std::vector<uint8_t> & data) {
+    FILE * f = fopen(path, "rb");
+    if (f == nullptr) {
+        return false;
+    }
+    fseek(f, file_offset, SEEK_SET);
+    data.resize(size);
+    size_t read_size = fread(data.data(), 1, size, f);
+    fclose(f);
+    return read_size == size;
+}
+
 bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response)
 {
     std::vector<uint8_t> cached_file;
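
One portability caveat in read_model_data: fseek takes a long, which is 32 bits on Windows, so file offsets beyond 2 GiB (routine for GGUF models) would truncate there. If that matters for the target platforms, a 64-bit-safe seek could look like this (a sketch, not part of the patch):

    static int seek64(FILE * f, uint64_t offset) {
    #ifdef _WIN32
        return _fseeki64(f, (__int64) offset, SEEK_SET);
    #else
        return fseeko(f, (off_t) offset, SEEK_SET); // off_t is 64-bit on modern POSIX systems
    #endif
    }
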
@@ -1146,6 +1290,50 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
     return true;
 }
 
+bool rpc_server::load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response) {
+    if (model_hashes.find(request.model_hash) == model_hashes.end()) {
+        response.result = 0;
+        return true;
+    }
+    std::string path = model_hashes.at(request.model_hash);
+    std::vector<uint8_t> model_data;
+    if (!read_model_data(path.c_str(), request.file_offset, request.size, model_data)) {
+        response.result = 0;
+        return true;
+    }
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr ctx_ptr { ggml_init(params) };
+    GGML_ASSERT(ctx_ptr != nullptr);
+    ggml_context * ctx = ctx_ptr.get();
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
+        return false;
+    }
+    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, request.tensor_offset, request.size);
+
+    // sanitize tensor->data
+    {
+        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+        if (request.tensor.data + request.tensor_offset < p0
+         || request.tensor.data + request.tensor_offset >= p1
+         || request.size > (p1 - request.tensor.data - request.tensor_offset)) {
+            GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%" PRIu64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
+                           __func__, request.tensor.data, request.tensor_offset, request.size, p0, p1);
+            return false;
+        }
+    }
+    ggml_backend_tensor_set(tensor, model_data.data(), request.tensor_offset, request.size);
+    response.result = 1;
+    return true;
+}
+
 bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
     struct ggml_init_params params {
         /*.mem_size   =*/ ggml_tensor_overhead(),
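
The sanitization block mirrors the one in set_tensor: the first two comparisons pin data + tensor_offset inside [p0, p1), and the third is phrased as a subtraction so it cannot overflow (the second check already guarantees p1 - data - tensor_offset is positive). A worked example with made-up numbers:

    // base p0 = 0x1000, buffer size = 0x100, so p1 = 0x1100
    // request: data = 0x1080, tensor_offset = 0x40, size = 0x80
    //   data + tensor_offset = 0x10c0 -> inside [0x1000, 0x1100), first two checks pass
    //   p1 - data - tensor_offset = 0x40, which is < size = 0x80 -> third check fails
    // the request is rejected instead of writing 0x40 bytes past the end of the buffer
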
@@ -1368,9 +1556,11 @@ rpc_server::~rpc_server() {
     }
 }
 
-static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
+static void rpc_serve_client(ggml_backend_t backend,
+                             const std::unordered_map<uint64_t, std::string> & model_hashes,
+                             const char * cache_dir,
                              sockfd_t sockfd, size_t free_mem, size_t total_mem) {
-    rpc_server server(backend, cache_dir);
+    rpc_server server(backend, model_hashes, cache_dir);
     uint8_t cmd;
     if (!recv_data(sockfd, &cmd, 1)) {
         return;
@@ -1514,6 +1704,20 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                 }
                 break;
             }
+            case RPC_CMD_LOAD_TENSOR: {
+                rpc_msg_load_tensor_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_load_tensor_rsp response;
+                if (!server.load_tensor(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
             case RPC_CMD_INIT_TENSOR: {
                 rpc_msg_init_tensor_req request;
                 if (!recv_msg(sockfd, &request, sizeof(request))) {
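
Note the two failure modes in this handler: soft failures inside load_tensor (unknown model hash, short read) send a response with result = 0 so the client can fall back to ordinary set_tensor traffic, while malformed requests (undeserializable tensor, out-of-bounds region) make load_tensor return false, which drops the client connection here.
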
@@ -1590,7 +1794,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
 }
 
 void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                   const char * cache_dir,
+                                   const char * model_file, const char * cache_dir,
                                    size_t free_mem, size_t total_mem) {
     printf("Starting RPC server v%d.%d.%d\n",
            RPC_PROTO_MAJOR_VERSION,
@@ -1600,6 +1804,21 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
     printf("  local cache    : %s\n", cache_dir ? cache_dir : "n/a");
     printf("  backend memory : %zu MB\n", free_mem / (1024 * 1024));
 
+    std::unordered_map<uint64_t, std::string> model_hashes;
+    if (model_file != nullptr) {
+        uint64_t model_hash;
+        if (!get_model_hash(model_file, &model_hash)) {
+            printf("Generating model hash for file: %s\n", model_file);
+            if (!generate_model_hash(model_file, &model_hash)) {
+                fprintf(stderr, "Failed to generate model hash for file: %s\n", model_file);
+                return;
+            }
+        }
+        printf("  model file     : %s\n", model_file);
+        printf("  model hash     : %" PRIx64 "\n", model_hash);
+        model_hashes[model_hash] = model_file;
+    }
+
     std::string host;
     int port;
     if (!parse_endpoint(endpoint, host, port)) {
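
Because generate_model_hash opens the GGUF with no_alloc = false, the first start with a new model reads the entire file into memory just to hash it; subsequent starts only read the 16-byte sidecar. The map is keyed by hash, so in principle a server could register several model files here, although this patch wires up only one.
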
@@ -1628,7 +1847,7 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
         }
         printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
         fflush(stdout);
-        rpc_serve_client(backend, cache_dir, client_socket->fd, free_mem, total_mem);
+        rpc_serve_client(backend, model_hashes, cache_dir, client_socket->fd, free_mem, total_mem);
         printf("Client connection closed\n");
         fflush(stdout);
     }
@@ -1762,6 +1981,9 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
     if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) {
         return (void *)ggml_backend_rpc_start_server;
     }
+    if (std::strcmp(name, "ggml_backend_tensor_load") == 0) {
+        return (void *)ggml_backend_rpc_buffer_load_tensor;
+    }
     return NULL;
 
     GGML_UNUSED(reg);