@@ -27,83 +27,106 @@ std::string server::host_name = "localhost";
2727int server::port = dmlc::GetEnv(" PORT" , 50051 );
2828std::string server::address = fmt::format(" {}:{}" , host_name, port);
2929
30+
3031static TShape to_shape (Shape shape) {
3132 auto dim = shape.dim ();
3233 TShape res (dim.begin (), dim.end ());
3334 return res;
3435}
3536
36- static void *get_device_ptr (const Layer &layer) {
37- const auto ipc_handle = layer.ipc_handle ();
38- if (ipc_handle == " " ) {
39- const auto msg = fmt::format (" unable to get device ptr from {}. make sure handle is not empty" , ipc_handle);
37+ static void *get_device_ptr_offset (const Layer &layer, void *devPtr) {
38+ const auto offset = layer.offset ();
39+ return (void *) (((char *) (devPtr)) + offset);
40+ }
41+
42+ static void *get_device_ptr (const std::string &handle_bytes) {
43+ if (handle_bytes == " " ) {
44+ const auto msg = fmt::format (" unable to get device ptr from {}. make sure handle is not empty" , handle_bytes);
4045 LOG (FATAL) << msg;
4146 throw dmlc::Error (msg);
4247 }
43-
4448 cudaIpcMemHandle_t handle;
45- memcpy ((uint8_t *) &handle, ipc_handle .c_str (), sizeof (handle));
49+ memcpy ((uint8_t *) &handle, handle_bytes .c_str (), sizeof (handle));
4650
47- // LOG(INFO) << "get handle = " << handle << "get base64 handle = " << utils::base64_encode(ipc_handle);
51+ void *device_ptr = nullptr ;
52+ CUDA_CHECK_CALL (cudaIpcOpenMemHandle ((void **) &device_ptr, handle, cudaIpcMemLazyEnablePeerAccess),
53+ fmt::format (" failed to open cuda ipc mem handle from {}" , utils::base64_encode (handle_bytes)));
4854
49- auto name = layer.name ();
55+ return device_ptr;
56+ }
5057
51- static const std::string arg_prefix (" arg:" );
52- if (string_starts_with (name, arg_prefix)) {
53- name.erase (0 , arg_prefix.size ());
54- }
55- static const std::string aux_prefix (" aux:" );
56- if (string_starts_with (name, aux_prefix)) {
57- name.erase (0 , aux_prefix.size ());
58- }
58+ static void *get_device_ptr (const Layer &layer) {
59+ auto name = layer.name ();
60+ const auto ipc_handle = layer.ipc_handle ();
5961
60- void *device_ptr;
61- auto span = start_span (" cudaIpcOpenMemHandle" , span_category_ipc, span_props{{" layer" , name},
62- {" byte_count" , std::to_string (layer.byte_count ())}});
63- CUDA_CHECK_CALL (cudaIpcOpenMemHandle ((void **) &device_ptr, handle, cudaIpcMemLazyEnablePeerAccess),
64- fmt::format (" failed to open cuda ipc mem handle from {}" , utils::base64_encode (ipc_handle)));
65- stop_span (span);
62+ static const std::string arg_prefix (" arg:" );
63+ if (string_starts_with (name, arg_prefix)) {
64+ name.erase (0 , arg_prefix.size ());
65+ }
66+ static const std::string aux_prefix (" aux:" );
67+ if (string_starts_with (name, aux_prefix)) {
68+ name.erase (0 , aux_prefix.size ());
69+ }
6670
67- // LOG(INFO) << "get device_ptr = " << device_ptr;
71+ auto span = start_span (" cudaIpcOpenMemHandle" ,
72+ span_category_ipc,
73+ span_props{{" layer" , name}, {" byte_count" , std::to_string (layer.byte_count ())}});
74+ auto device_ptr = get_device_ptr (ipc_handle.c_str ());
75+ stop_span (span);
6876
6977 return device_ptr;
7078}
7179
72- static void to_ndarray (std::vector<NDArray> *arrays, const Layer &layer) {
73- const auto ctx = get_ctx ();
74-
75- auto span = start_span (" to_nd_array" , span_category_serialization, span_props{{" layer" , layer.name ()}});
76- defer (stop_span (span));
77-
78- const auto shape = to_shape (layer.shape ());
80+ static void to_ndarrays (std::vector<NDArray> *arrays, std::vector<std::string> *keys, const ModelHandle &model_handle) {
81+ const auto ctx = get_ctx ();
7982 const auto dev_mask = ctx.dev_mask ();
8083 const auto dev_id = ctx.dev_id ;
8184
82- // LOG(INFO) << "in layer=" << layer.name() << " getting device ptr using ctx = " << ctx;
83-
84- auto device_ptr = get_device_ptr (layer);
85-
86- auto span_creating =
87- start_span (" creating_nd_array" , span_category_serialization, span_props{{" layer" , layer.name ()}});
88- defer (stop_span (span_creating));
89-
90- TBlob blob (device_ptr, shape, dev_mask, dev_id);
91- arrays->emplace_back (blob, dev_id, /* is_shared = */ true );
92-
93- return ;
94- }
95-
96- static void to_ndarrays (std::vector<NDArray> *arrays, std::vector<std::string> *keys, const ModelHandle &reply) {
97- const auto layers = reply.layer ();
85+ const auto layers = model_handle.layer ();
9886
9987 // LOG(INFO) << "got " << layers.size() << " layers form reply, before to_ndarray";
10088
101- for (const auto layer : layers) {
102- keys->emplace_back (layer.name ());
103- to_ndarray (arrays, layer);
89+ if (model_handle.sharing_granularity () == SharingGranularity_Model) {
90+ auto ipc_open_span = start_span (
91+ " cudaIpcOpenMemHandle" ,
92+ span_category_ipc,
93+ span_props{{" model" , model_handle.name ()}, {" byte_count" , std::to_string (model_handle.byte_count ())}});
94+ auto base_device_ptr = get_device_ptr (model_handle.ipc_handle ());
95+ stop_span (ipc_open_span);
96+
97+ for (const auto layer : layers) {
98+ // auto create_layer_span = start_span("to_nd_array",
99+ // span_category_serialization,
100+ // span_props{{"layer", layer.name()}, {"sharing_granularity", "model"}});
101+
102+ keys->emplace_back (layer.name ());
103+ const auto shape = to_shape (layer.shape ());
104+ auto device_ptr = get_device_ptr_offset (layer, base_device_ptr);
105+ TBlob blob (device_ptr, shape, dev_mask, dev_id);
106+ arrays->emplace_back (blob, dev_id, /* is_shared = */ true );
107+
108+ // stop_span(create_layer_span);
109+ }
110+ return ;
111+ }
112+ if (model_handle.sharing_granularity () == SharingGranularity_Model) {
113+ for (const auto layer : layers) {
114+ // auto create_layer_span = start_span("to_nd_array",
115+ // span_category_serialization,
116+ // span_props{{"layer", layer.name()}, {"sharing_granularity", "layer"}});
117+
118+ keys->emplace_back (layer.name ());
119+ const auto shape = to_shape (layer.shape ());
120+ auto device_ptr = get_device_ptr (layer);
121+ TBlob blob (device_ptr, shape, dev_mask, dev_id);
122+ arrays->emplace_back (blob, dev_id, /* is_shared = */ true );
123+
124+ // stop_span(create_layer_span);
125+ }
126+ return ;
104127 }
105128
106- // LOG(INFO) << "finished nd_array conversion" ;
129+ throw dmlc::Error ( " invalid granularity " ) ;
107130
108131 return ;
109132}
@@ -158,6 +181,14 @@ struct client {
158181 ModelHandle Open (const std::string &model_name) {
159182 ModelRequest request;
160183 request.set_name (model_name);
184+ if (UPR_SHARING_GRANULARITY == " model" ) {
185+ request.set_sharing_granularity (SharingGranularity_Model);
186+ } else if (UPR_SHARING_GRANULARITY == " layer" ) {
187+ request.set_sharing_granularity (SharingGranularity_Layer);
188+ } else {
189+ throw dmlc::Error (
190+ fmt::format (" Error: [{}] {}. failed to determine model granularity." , UPR_SHARING_GRANULARITY));
191+ }
161192 return this ->Open (request);
162193 }
163194
@@ -218,6 +249,7 @@ struct client {
218249 span_category_serialization,
219250 span_props{{" model_id" , open_reply.model_id ()},
220251 {" byte_count" , std::to_string (open_reply.byte_count ())},
252+ {" needed_eviction" , std::to_string (open_reply.needed_eviction ())},
221253 {" nlayers" , std::to_string (open_reply.layer ().size ())}});
222254 defer (stop_span (span_converting));
223255
@@ -247,5 +279,11 @@ void Unload(MXAPIPredictor *pred) {
247279 return ;
248280}
249281
282+ void initialize () {
283+ if (is_client && UPR_ENABLED) {
284+ client::get_connection ();
285+ }
286+ }
287+
250288} // namespace upr
251289#endif // MXNET_USE_CUDA
0 commit comments