@@ -93,6 +93,7 @@ enum rpc_cmd {
     RPC_CMD_INIT_TENSOR,
     RPC_CMD_GET_ALLOC_SIZE,
     RPC_CMD_HELLO,
+    RPC_CMD_LOAD_TENSOR,
     RPC_CMD_COUNT,
 };

@@ -161,6 +162,18 @@ struct rpc_msg_set_tensor_hash_rsp {
     uint8_t result;
 };

+struct rpc_msg_load_tensor_req {
+    uint64_t model_hash;
+    rpc_tensor tensor;
+    uint64_t file_offset;
+    uint64_t tensor_offset;
+    uint64_t size;
+};
+
+struct rpc_msg_load_tensor_rsp {
+    uint8_t result;
+};
+
 struct rpc_msg_get_tensor_req {
     rpc_tensor tensor;
     uint64_t offset;
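These request/response structs are sent over the socket as raw bytes (via send_rpc_cmd / recv_msg), which is only safe while they stay trivially copyable. A compile-time guard along these lines could enforce that (a sketch assuming <type_traits> is included; not part of the diff):

    static_assert(std::is_trivially_copyable<rpc_msg_load_tensor_req>::value,
                  "rpc_msg_load_tensor_req is sent as raw bytes over the socket");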
@@ -213,6 +226,24 @@ struct ggml_backend_rpc_buffer_context {

 // RPC helper functions

+typedef uint64_t fnv_ctx_t;
+
+static void fnv_init(fnv_ctx_t * ctx) {
+    *ctx = 0xcbf29ce484222325ULL;
+}
+
+static void fnv_update(fnv_ctx_t * ctx, const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    for (size_t i = 0; i < len; ++i) {
+        *ctx ^= data[i];
+        *ctx *= fnv_prime;
+    }
+}
+
+static void fnv_final(fnv_ctx_t * ctx, uint64_t * digest) {
+    *digest = *ctx;
+}
+
 // Computes FNV-1a hash of the data
 static uint64_t fnv_hash(const uint8_t * data, size_t len) {
     const uint64_t fnv_prime = 0x100000001b3ULL;
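The new init/update/final helpers compute the same FNV-1a digest as the existing one-shot fnv_hash(), just fed incrementally so a multi-gigabyte model never has to sit in one contiguous buffer. A quick equivalence check (hypothetical test code, not part of the diff):

    static void fnv_self_check(const uint8_t * data, size_t len) {
        fnv_ctx_t ctx;
        fnv_init(&ctx);
        fnv_update(&ctx, data, len);   // could also be split into several calls
        uint64_t digest;
        fnv_final(&ctx, &digest);
        GGML_ASSERT(digest == fnv_hash(data, len));
    }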
@@ -225,6 +256,87 @@ static uint64_t fnv_hash(const uint8_t * data, size_t len) {
     return hash;
 }

+static bool get_model_hash_from_file(const char * model_file, uint64_t * hash) {
+    // try loading the hash from model_file + '.rpc'
+    std::string rpc_file = std::string(model_file) + ".rpc";
+    // the hash file must exist, must be exactly 16 bytes and must be a valid hash written in hex
+    if (!fs::exists(rpc_file)) {
+        return false;
+    }
+    std::ifstream file(rpc_file, std::ios::binary);
+    if (!file.is_open()) {
+        return false;
+    }
+    std::string hash_str;
+    file.seekg(0, std::ios::end);
+    size_t file_size = file.tellg();
+    if (file_size != 16) {
+        return false;
+    }
+    file.seekg(0, std::ios::beg);
+    hash_str.resize(file_size);
+    file.read(&hash_str[0], file_size);
+    if ((size_t)file.gcount() != file_size) {
+        return false;
+    }
+    if (hash_str.find_first_not_of("0123456789abcdefABCDEF") != std::string::npos) {
+        return false;
+    }
+    *hash = std::stoull(hash_str, nullptr, 16);
+    return true;
+}
+
+static bool get_model_hash(const char * model_file, uint64_t * hash) {
+    // model path -> (hash_exists, hash_value)
+    static std::unordered_map<std::string, std::pair<bool, uint64_t>> model_hashes;
+    auto it = model_hashes.find(model_file);
+    if (it != model_hashes.end()) {
+        *hash = it->second.second;
+        return it->second.first;
+    }
+    if (get_model_hash_from_file(model_file, hash)) {
+        model_hashes[model_file] = {true, *hash};
+        return true;
+    }
+    model_hashes[model_file] = {false, 0};
+    return false;
+}
+
+static bool generate_model_hash(const char * model_file, uint64_t * hash) {
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    gguf_context_ptr ctx_gguf { gguf_init_from_file(model_file, params) };
+    if (!ctx_gguf) {
+        return false;
+    }
+    fnv_ctx_t fnv_ctx;
+    fnv_init(&fnv_ctx);
+    size_t data_offset = gguf_get_data_offset(ctx_gguf.get());
+    fnv_update(&fnv_ctx, (const uint8_t *)&data_offset, sizeof(data_offset));
+    const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+    for (int i = 0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+        ggml_tensor * cur = ggml_get_tensor(ctx, name);
+        auto n_bytes = ggml_nbytes(cur);
+        fnv_update(&fnv_ctx, (const uint8_t *)cur->data, n_bytes);
+    }
+    fnv_final(&fnv_ctx, hash);
+    // save the model hash to model_file + '.rpc' in hex format
+    std::string hash_file = std::string(model_file) + ".rpc";
+    std::ofstream file(hash_file, std::ios::binary);
+    if (!file.is_open()) {
+        return false;
+    }
+    file << std::hex << std::setfill('0') << std::setw(16) << *hash;
+    if (!file) {
+        return false;
+    }
+    file.close();
+    return true;
+}
+
 static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
 #ifdef _WIN32
     if (fd == INVALID_SOCKET) {
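Together these helpers implement a simple sidecar scheme: the digest lives next to the model as <model>.rpc, written as 16 hex characters, and get_model_hash() memoizes lookups per path. A round-trip sketch with a hypothetical path (not part of the diff):

    uint64_t h1, h2;
    if (generate_model_hash("model.gguf", &h1)) {        // writes model.gguf.rpc
        GGML_ASSERT(get_model_hash("model.gguf", &h2));  // reads it back (cached afterwards)
        GGML_ASSERT(h1 == h2);
    }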
@@ -605,6 +717,24 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     return response.result;
 }

+bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    uint64_t hash;
+    if (!get_model_hash(path, &hash)) {
+        return false;
+    }
+    rpc_msg_load_tensor_req request;
+    request.model_hash    = hash;
+    request.tensor        = serialize_tensor(tensor);
+    request.file_offset   = file_offset;
+    request.tensor_offset = tensor_offset;
+    request.size          = size;
+    rpc_msg_load_tensor_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_LOAD_TENSOR, &request, sizeof(request), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.result;
+}
+
 static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
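On the client side, a loader that knows where a tensor's bytes start inside the GGUF file can replace the usual set_tensor upload (which ships the whole payload over the socket) with this call, sending only a small request. A hypothetical caller sketch; the loader-side wiring is not part of this diff:

    // 'file_offset' is assumed to be the byte offset of the tensor's data
    // inside the GGUF file (data section start + per-tensor offset).
    static bool upload_from_server_copy(ggml_backend_buffer_t buf, ggml_tensor * t,
                                        const char * model_path, size_t file_offset) {
        return ggml_backend_rpc_buffer_load_tensor(buf, t, model_path, file_offset,
                                                   /* tensor_offset = */ 0,
                                                   ggml_nbytes(t));
    }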
@@ -854,8 +984,8 @@ void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, si

 class rpc_server {
 public:
-    rpc_server(ggml_backend_t backend, const char * cache_dir)
-        : backend(backend), cache_dir(cache_dir) {
+    rpc_server(ggml_backend_t backend, const std::unordered_map<uint64_t, std::string> & model_hashes, const char * cache_dir)
+        : backend(backend), cache_dir(cache_dir), model_hashes(model_hashes) {
     }
     ~rpc_server();
@@ -868,6 +998,7 @@ class rpc_server {
     bool buffer_clear(const rpc_msg_buffer_clear_req & request);
     bool set_tensor(const std::vector<uint8_t> & input);
     bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
+    bool load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response);
     bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
     bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
     bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
@@ -886,6 +1017,7 @@ class rpc_server {
     ggml_backend_t backend;
     const char * cache_dir;
     std::unordered_set<ggml_backend_buffer_t> buffers;
+    const std::unordered_map<uint64_t, std::string> & model_hashes;
 };

 void rpc_server::hello(rpc_msg_hello_rsp & response) {
@@ -1104,6 +1236,18 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
     return true;
 }

+static bool read_model_data(const char * path, size_t file_offset, size_t size, std::vector<uint8_t> & data) {
+    FILE * f = fopen(path, "rb");
+    if (f == nullptr) {
+        return false;
+    }
+    if (fseek(f, (long) file_offset, SEEK_SET) != 0) {
+        fclose(f);
+        return false;
+    }
+    data.resize(size);
+    size_t read_size = fread(data.data(), 1, size, f);
+    fclose(f);
+    return read_size == size;
+}
+
 bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response)
 {
     std::vector<uint8_t> cached_file;
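One caveat: fseek takes a long offset, which is 32-bit on Windows, so seeking past 2 GiB (routine for GGUF files) would fail there even with the return value checked. A 64-bit-safe seek could look like this (a sketch, not part of the diff):

    static bool seek64(FILE * f, uint64_t offset) {
    #ifdef _WIN32
        return _fseeki64(f, (__int64) offset, SEEK_SET) == 0;
    #else
        return fseeko(f, (off_t) offset, SEEK_SET) == 0;
    #endif
    }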
@@ -1146,6 +1290,50 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
     return true;
 }

+bool rpc_server::load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response) {
+    if (model_hashes.find(request.model_hash) == model_hashes.end()) {
+        response.result = 0;
+        return true;
+    }
+    std::string path = model_hashes.at(request.model_hash);
+    std::vector<uint8_t> model_data;
+    if (!read_model_data(path.c_str(), request.file_offset, request.size, model_data)) {
+        response.result = 0;
+        return true;
+    }
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr ctx_ptr { ggml_init(params) };
+    GGML_ASSERT(ctx_ptr != nullptr);
+    ggml_context * ctx = ctx_ptr.get();
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
+        return false;
+    }
+    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void *)tensor->buffer, tensor->data, request.tensor_offset, request.size);
+
+    // sanitize tensor->data
+    {
+        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+        if (request.tensor.data + request.tensor_offset < p0
+         || request.tensor.data + request.tensor_offset >= p1
+         || request.size > (p1 - request.tensor.data - request.tensor_offset)) {
+            GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%" PRIu64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
+                           __func__, request.tensor.data, request.tensor_offset, request.size, p0, p1);
+            return false;
+        }
+    }
+    ggml_backend_tensor_set(tensor, model_data.data(), request.tensor_offset, request.size);
+    response.result = 1;
+    return true;
+}
+
 bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
     struct ggml_init_params params {
         /*.mem_size   =*/ ggml_tensor_overhead(),
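The sanitize block mirrors the bounds checks used elsewhere in this file for set_tensor. With illustrative numbers: for a buffer with base p0 = 0x1000 and size 0x100 (so p1 = 0x1100), a request with data = 0x1040, tensor_offset = 0x10 and size = 0x20 is accepted, since data + tensor_offset = 0x1050 lies in [0x1000, 0x1100) and 0x20 <= 0x1100 - 0x1050 = 0xb0; bump size to 0xc0 and the third condition rejects it. Note that a failed lookup or read reports result = 0 to the client, while a malformed tensor or out-of-bounds region returns false and drops the connection.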
@@ -1368,9 +1556,11 @@ rpc_server::~rpc_server() {
     }
 }

-static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
+static void rpc_serve_client(ggml_backend_t backend,
+                             const std::unordered_map<uint64_t, std::string> & model_hashes,
+                             const char * cache_dir,
                              sockfd_t sockfd, size_t free_mem, size_t total_mem) {
-    rpc_server server(backend, cache_dir);
+    rpc_server server(backend, model_hashes, cache_dir);
     uint8_t cmd;
     if (!recv_data(sockfd, &cmd, 1)) {
         return;
@@ -1514,6 +1704,20 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                 }
                 break;
             }
+            case RPC_CMD_LOAD_TENSOR: {
+                rpc_msg_load_tensor_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_load_tensor_rsp response;
+                if (!server.load_tensor(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
             case RPC_CMD_INIT_TENSOR: {
                 rpc_msg_init_tensor_req request;
                 if (!recv_msg(sockfd, &request, sizeof(request))) {
@@ -1590,7 +1794,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
 }

 void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                   const char * cache_dir,
+                                   const char * model_file, const char * cache_dir,
                                    size_t free_mem, size_t total_mem) {
     printf("Starting RPC server v%d.%d.%d\n",
         RPC_PROTO_MAJOR_VERSION,
@@ -1600,6 +1804,21 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
     printf("  local cache    : %s\n", cache_dir ? cache_dir : "n/a");
     printf("  backend memory : %zu MB\n", free_mem / (1024 * 1024));

+    std::unordered_map<uint64_t, std::string> model_hashes;
+    if (model_file != nullptr) {
+        uint64_t model_hash;
+        if (!get_model_hash(model_file, &model_hash)) {
+            printf("Generating model hash for file: %s\n", model_file);
+            if (!generate_model_hash(model_file, &model_hash)) {
+                fprintf(stderr, "Failed to generate model hash for file: %s\n", model_file);
+                return;
+            }
+        }
+        printf("  model file     : %s\n", model_file);
+        printf("  model hash     : %" PRIx64 "\n", model_hash);
+        model_hashes[model_hash] = model_file;
+    }
+
     std::string host;
     int port;
     if (!parse_endpoint(endpoint, host, port)) {
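A host program would start the server through the extended signature; a minimal sketch, assuming a CPU backend and example endpoint/path (the serving binary's CLI changes are not shown in this diff):

    ggml_backend_t backend = ggml_backend_cpu_init();
    size_t free_mem  = (size_t) 16 << 30;  // example values
    size_t total_mem = free_mem;
    ggml_backend_rpc_start_server(backend, "0.0.0.0:50052",
                                  "/models/model.gguf", /* cache_dir = */ nullptr,
                                  free_mem, total_mem);

Passing nullptr for model_file preserves the old behavior: model_hashes stays empty and every RPC_CMD_LOAD_TENSOR request is answered with result = 0.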
@@ -1628,7 +1847,7 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
         }
         printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
         fflush(stdout);
-        rpc_serve_client(backend, cache_dir, client_socket->fd, free_mem, total_mem);
+        rpc_serve_client(backend, model_hashes, cache_dir, client_socket->fd, free_mem, total_mem);
         printf("Client connection closed\n");
         fflush(stdout);
     }
@@ -1762,6 +1981,9 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
     if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) {
         return (void *)ggml_backend_rpc_start_server;
     }
+    if (std::strcmp(name, "ggml_backend_rpc_buffer_load_tensor") == 0) {
+        return (void *)ggml_backend_rpc_buffer_load_tensor;
+    }
     return NULL;

     GGML_UNUSED(reg);
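Clients that load the backend dynamically can resolve the new entry point through the backend registry instead of linking it directly; a sketch with the function-pointer type spelled out by hand:

    typedef bool (*rpc_load_tensor_fn)(ggml_backend_buffer_t, ggml_tensor *,
                                       const char *, size_t, size_t, size_t);
    auto fn = (rpc_load_tensor_fn) ggml_backend_reg_get_proc_address(
                  reg, "ggml_backend_rpc_buffer_load_tensor");
    // fn is NULL on builds that predate this command, so callers can fall back
    // to the regular set_tensor upload path.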