@@ -110,9 +110,9 @@ struct rpc_msg_init_tensor_req {
     rpc_tensor tensor;
 };
 
-struct rpc_msg_init_tensor_rsp {
-    uint8_t result; // success/failure
-};
+// struct rpc_msg_init_tensor_rsp {
+//     uint8_t result; // success/failure
+// };
 
 struct rpc_msg_alloc_buffer_req {
     uint64_t size;
@@ -479,16 +479,15 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
 }
 
 static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    // UNUSED(buffer);
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
 
+    // CUDA backend on the server pads everything to 512 due to CUDA limitations.
+    // Due to bandwidth constraints, we only call the server init_tensor functions if necessary.
     if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0)) {
-        // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized
-        // GGML_ASSERT(tensor->ne[0] % 512 == 0 && "unsupported quantized tensor");
         rpc_msg_init_tensor_req request;
+
         request.tensor = serialize_tensor(tensor);
 
-        // rpc_msg_init_tensor_rsp response;
         bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
         GGML_ASSERT(status);
     }
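For reference, the guard that both client-side hooks share, lifted into a standalone sketch. The helper name is hypothetical and not part of the patch; the 512 literal mirrors CUDA's MATRIX_ROW_PADDING mentioned in the removed TODO:

```cpp
#include "ggml.h"

// Hypothetical helper (illustration only): true when the server's CUDA
// backend may pad the tensor, i.e. the padded allocation size and any
// backend-specific init work cannot be derived locally and must be
// queried over RPC.
static bool rpc_tensor_may_be_padded(const ggml_tensor * tensor) {
    return ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0);
}
```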
@@ -603,11 +602,13 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
 }
 
 static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    // See comments in init_tensor.
     if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0)) {
         ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
         auto sock = get_socket(buft_ctx->endpoint);
 
         rpc_msg_get_alloc_size_req request;
+
         request.tensor = serialize_tensor(tensor);
 
         rpc_msg_get_alloc_size_rsp response;
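The hunk cuts off at the response declaration; presumably the call continues with the same request/response round trip used by the other commands in ggml-rpc.cpp. A sketch under that assumption (the RPC_CMD_GET_ALLOC_SIZE command name is inferred from the message struct names, not shown in the hunk):

```cpp
// Sketch of the remainder of the round trip: the server replies with the
// allocation size computed by its local backend, including any CUDA row
// padding.
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
GGML_ASSERT(status);
return response.alloc_size;
```

For tensors that fail the guard, the function presumably falls back to computing the size locally with `ggml_nbytes(tensor)`, avoiding the round trip.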
@@ -812,41 +813,30 @@ class rpc_server {
 };
 
 bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
-    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+    ggml_backend_buffer_type_t buft;
     struct ggml_init_params params {
         /*.mem_size   =*/ ggml_tensor_overhead(),
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ true,
     };
+
     struct ggml_context * ctx = ggml_init(params);
     ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+
     if (tensor == nullptr) {
-        printf("Got nullptr\n");
+        fprintf(stderr, "Null tensor pointer passed to server get_alloc_size function.\n");
         ggml_free(ctx);
         return false;
     }
 
-    printf("Getting buft\n");
-
-    // ggml_backend_buffer_get_alloc_size(tensor->buffer,tensor)
-
-    // if (tensor->buffer == nullptr) {
-    //     printf("Got null buffer\n");
-    //     response.alloc_size = 0;
-    //     ggml_free(ctx);
-    //     return true;
-    // }
+    if (tensor->buffer == nullptr) {
+        // No buffer allocated.
+        buft = ggml_backend_get_default_buffer_type(backend);
+    } else {
+        buft = tensor->buffer->buft;
+    }
 
     response.alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);
-    // Call the backend's buffer_type_get_alloc_size function
-    // ggml_backend_buffer_type_t buft = tensor->buffer->buft;
-    // if (buft && buft->iface.get_alloc_size) {
-    //     printf("Called buffer type get alloc size\n");
-    //     response.alloc_size = buft->iface.get_alloc_size(buft, tensor);
-    // } else {
-    //     printf("Called ggml_nbytes");
-    //     response.alloc_size = ggml_nbytes(tensor);
-    // }
 
     ggml_free(ctx);
     return true;
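The server-side helper leans on `ggml_backend_buft_get_alloc_size` to do the right thing for both padded and unpadded buffer types; the commented-out block deleted above spelled the same dispatch out inline. A sketch of the semantics being relied on (illustration only, not the actual ggml-backend implementation):

```cpp
// Sketch: a buffer type with a custom get_alloc_size hook (e.g. CUDA, which
// pads quantized rows) reports the padded size; everything else falls back
// to the plain tensor size.
static size_t alloc_size_sketch(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    if (buft->iface.get_alloc_size) {
        return buft->iface.get_alloc_size(buft, tensor);
    }
    return ggml_nbytes(tensor);
}
```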
@@ -996,20 +986,17 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
     struct ggml_context * ctx = ggml_init(params);
     ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
     if (tensor == nullptr) {
-        printf("Null tensor\n");
+        fprintf(stderr, "Null tensor pointer passed to server init_tensor function.\n");
         ggml_free(ctx);
         return false;
     }
 
-    printf("about to call buffer\n");
-
-    // ggml_backend_init_tensor
-
     // Call the backend's buffer_init_tensor function
     ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer && buffer->iface.init_tensor) {
-        printf("Calling buffer iface function\n");
         buffer->iface.init_tensor(buffer, tensor);
+    } else {
+        fprintf(stderr, "Null buffer for tensor passed to init_tensor function\n");
     }
 
     ggml_free(ctx);
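For context on what the forwarded init typically does on the server: a sketch modeled on the CUDA backend's init_tensor behavior, not copied from it (`memset` stands in for the backend-specific clear, e.g. cudaMemset on device memory). The region between the tensor's nominal size and its padded allocation is zeroed so kernels reading whole padded rows see deterministic data:

```cpp
// Sketch (assumed behavior of a padding backend's init_tensor):
const size_t original_size = ggml_nbytes(tensor);
const size_t padded_size   = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
if (padded_size > original_size) {
    // Zero the tail so padded reads are well-defined; a real backend would
    // use its own memory-set primitive here.
    memset((char *)tensor->data + original_size, 0, padded_size - original_size);
}
```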