Skip to content

Commit 0f2d163

Browse files
author
abdul dakkak
authored
Merge pull request #4 from rai-project/feature/persistent_granularity
Feature/persistent granularity
2 parents ffd91b2 + 2b744ca commit 0f2d163

File tree

19 files changed

+1320
-261
lines changed

19 files changed

+1320
-261
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ The server is part of the MXNet build process.
132132
| UPR_INPUT_MEAN_B | | 0 |
133133
| UPR_ENABLE_MEMORY_PROFILE | | false |
134134
| UPR_ENABLE_CUDA_FREE | | false |
135+
| UPR_SHARING_GRANULARITY | | model |
135136
| -------------------------- | ----------- | ------------- |
136137
| UPRD_EVICTION_POLICY | | LRU |
137138
| UPRD_ESTIMATION_RATE | | 1.0 |

config.mk

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ export NVCC = nvcc
4545
DEV = 0
4646

4747
# whether compile with debug
48-
DEBUG = 0
48+
DEBUG = 0
4949

5050
# whether compile with profiler
5151
USE_PROFILER =
@@ -174,7 +174,7 @@ USE_S3 = 0
174174
# performance settings
175175
#----------------------------
176176
# Use operator tuning
177-
USE_OPERATOR_TUNING = 1
177+
USE_OPERATOR_TUNING = 0
178178

179179
# Use gperftools if found
180180
USE_GPERFTOOLS = 1
@@ -225,7 +225,7 @@ USE_PROFILER=1
225225
# whether compile with options for MXNet developer
226226
DEV = 0
227227

228-
DEBUG = 1
228+
DEBUG = 0
229229
USE_GLOG=1
230230
USE_OPERATOR_TUNING = 0
231231
USE_OPENMP = 0

example/image-classification/predict-cpp/Makefile

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,19 @@ CFLAGS+=-DUSE_CUDNN=1
3030
CFLAGS+=-DMSHADOW_USE_CUDNN=1
3131
CFLAGS+=-DNO_OPENMP=1
3232
CFLAGS+=-DUSE_CUDA=1
33+
CFLAGS+=-DUSE_GPERFTOOLS=1
34+
CFLAGS+=-Xcompiler -fno-builtin-malloc,-fno-builtin-calloc,-fno-builtin-realloc,-fno-builtin-free
3335
CFLAGS+=-DMXNET_USE_CUDA=1
34-
CFLAGS+= -Xcompiler -finstrument-functions
36+
#CFLAGS+= -Xcompiler -finstrument-functions
3537
LDFLAGS+=$(MXNET_ROOT)/lib/libmxnet.so -Xcompiler -finstrument-functions
38+
LDFLAGS+=-ltcmalloc
3639

3740
image-classification-predict: image-classification-predict.o
38-
nvcc -O3 -o image-classification-predict image-classification-predict.o $(LDFLAGS)
41+
nvcc -O3 -g -o image-classification-predict image-classification-predict.o $(LDFLAGS)
3942

4043
image-classification-predict.o: image-classification-predict.cc
4144
echo "CFLAGS = " $(CFLAGS)
42-
nvcc -O3 -c image-classification-predict.cc $(CFLAGS)
45+
nvcc -O3 -g -c image-classification-predict.cc $(CFLAGS)
4346

4447
clean:
4548
rm -f image-classification-predict

example/image-classification/predict-cpp/image-classification-predict.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -282,10 +282,10 @@ int main(int argc, char *argv[]) {
282282
MXSetProfilerState(0);
283283

284284
// // Synset path for your model, you have to modify it
285-
// std::vector<std::string> synset = LoadSynset(synset_file);
285+
// std::vector<std::string> synset = LoadSynset(synset_file);
286286

287287
// // Print Output Data
288-
// PrintOutputResult(data, size, synset);
288+
// PrintOutputResult(data, size, synset);
289289

290290
return 0;
291291
}

example/image-classification/predict-cpp/test.sh

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,14 @@ export GLOG_logtostderr=1
1111

1212
export UPR_ENABLED=true
1313
export UPR_CLIENT=1
14-
# export UPR_INITIALIZE_EAGER=true
14+
export UPR_INITIALIZE_EAGER=true
1515
# export UPR_ENABLE_MEMORY_PROFILE=true
1616

17-
UPR_MODEL_NAME=inception_3.0 ./image-classification-predict
18-
# UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict
19-
# UPR_MODEL_NAME=bvlc_alexnet_1.0 ./image-classification-predict&
17+
#UPR_MODEL_NAME=inception_3.0 ./image-classification-predict
18+
#UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict
19+
UPR_MODEL_NAME=vgg16_1.0 ./image-classification-predict
20+
#UPR_MODEL_NAME=squeezenet_1.0 ./image-classification-predict
21+
#UPR_MODEL_NAME=bvlc_alexnet_1.0 ./image-classification-predict
2022
# UPR_MODEL_NAME=bvlc_alexnet_1.0 ./image-classification-predict&
2123
# UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict &
2224
# UPR_MODEL_NAME=bvlc_googlenet_1.0 ./image-classification-predict &

src/c_api/ipc.cc

Lines changed: 88 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -27,83 +27,106 @@ std::string server::host_name = "localhost";
2727
int server::port = dmlc::GetEnv("PORT", 50051);
2828
std::string server::address = fmt::format("{}:{}", host_name, port);
2929

30+
3031
static TShape to_shape(Shape shape) {
3132
auto dim = shape.dim();
3233
TShape res(dim.begin(), dim.end());
3334
return res;
3435
}
3536

36-
static void *get_device_ptr(const Layer &layer) {
37-
const auto ipc_handle = layer.ipc_handle();
38-
if (ipc_handle == "") {
39-
const auto msg = fmt::format("unable to get device ptr from {}. make sure handle is not empty", ipc_handle);
37+
static void *get_device_ptr_offset(const Layer &layer, void *devPtr) {
38+
const auto offset = layer.offset();
39+
return (void *) (((char *) (devPtr)) + offset);
40+
}
41+
42+
static void *get_device_ptr(const std::string &handle_bytes) {
43+
if (handle_bytes == "") {
44+
const auto msg = fmt::format("unable to get device ptr from {}. make sure handle is not empty", handle_bytes);
4045
LOG(FATAL) << msg;
4146
throw dmlc::Error(msg);
4247
}
43-
4448
cudaIpcMemHandle_t handle;
45-
memcpy((uint8_t *) &handle, ipc_handle.c_str(), sizeof(handle));
49+
memcpy((uint8_t *) &handle, handle_bytes.c_str(), sizeof(handle));
4650

47-
// LOG(INFO) << "get handle = " << handle << "get base64 handle = " << utils::base64_encode(ipc_handle);
51+
void *device_ptr = nullptr;
52+
CUDA_CHECK_CALL(cudaIpcOpenMemHandle((void **) &device_ptr, handle, cudaIpcMemLazyEnablePeerAccess),
53+
fmt::format("failed to open cuda ipc mem handle from {}", utils::base64_encode(handle_bytes)));
4854

49-
auto name = layer.name();
55+
return device_ptr;
56+
}
5057

51-
static const std::string arg_prefix("arg:");
52-
if (string_starts_with(name, arg_prefix)) {
53-
name.erase(0, arg_prefix.size());
54-
}
55-
static const std::string aux_prefix("aux:");
56-
if (string_starts_with(name, aux_prefix)) {
57-
name.erase(0, aux_prefix.size());
58-
}
58+
static void *get_device_ptr(const Layer &layer) {
59+
auto name = layer.name();
60+
const auto ipc_handle = layer.ipc_handle();
5961

60-
void *device_ptr;
61-
auto span = start_span("cudaIpcOpenMemHandle", span_category_ipc, span_props{{"layer", name},
62-
{"byte_count", std::to_string(layer.byte_count())}});
63-
CUDA_CHECK_CALL(cudaIpcOpenMemHandle((void **) &device_ptr, handle, cudaIpcMemLazyEnablePeerAccess),
64-
fmt::format("failed to open cuda ipc mem handle from {}", utils::base64_encode(ipc_handle)));
65-
stop_span(span);
62+
static const std::string arg_prefix("arg:");
63+
if (string_starts_with(name, arg_prefix)) {
64+
name.erase(0, arg_prefix.size());
65+
}
66+
static const std::string aux_prefix("aux:");
67+
if (string_starts_with(name, aux_prefix)) {
68+
name.erase(0, aux_prefix.size());
69+
}
6670

67-
// LOG(INFO) << "get device_ptr = " << device_ptr;
71+
auto span = start_span("cudaIpcOpenMemHandle",
72+
span_category_ipc,
73+
span_props{{"layer", name}, {"byte_count", std::to_string(layer.byte_count())}});
74+
auto device_ptr = get_device_ptr(ipc_handle);
75+
stop_span(span);
6876

6977
return device_ptr;
7078
}
7179

72-
static void to_ndarray(std::vector<NDArray> *arrays, const Layer &layer) {
73-
const auto ctx = get_ctx();
74-
75-
auto span = start_span("to_nd_array", span_category_serialization, span_props{{"layer", layer.name()}});
76-
defer(stop_span(span));
77-
78-
const auto shape = to_shape(layer.shape());
80+
static void to_ndarrays(std::vector<NDArray> *arrays, std::vector<std::string> *keys, const ModelHandle &model_handle) {
81+
const auto ctx = get_ctx();
7982
const auto dev_mask = ctx.dev_mask();
8083
const auto dev_id = ctx.dev_id;
8184

82-
// LOG(INFO) << "in layer=" << layer.name() << " getting device ptr using ctx = " << ctx;
83-
84-
auto device_ptr = get_device_ptr(layer);
85-
86-
auto span_creating =
87-
start_span("creating_nd_array", span_category_serialization, span_props{{"layer", layer.name()}});
88-
defer(stop_span(span_creating));
89-
90-
TBlob blob(device_ptr, shape, dev_mask, dev_id);
91-
arrays->emplace_back(blob, dev_id, /* is_shared = */ true);
92-
93-
return;
94-
}
95-
96-
static void to_ndarrays(std::vector<NDArray> *arrays, std::vector<std::string> *keys, const ModelHandle &reply) {
97-
const auto layers = reply.layer();
85+
const auto layers = model_handle.layer();
9886

9987
// LOG(INFO) << "got " << layers.size() << " layers form reply, before to_ndarray";
10088

101-
for (const auto layer : layers) {
102-
keys->emplace_back(layer.name());
103-
to_ndarray(arrays, layer);
89+
if (model_handle.sharing_granularity() == SharingGranularity_Model) {
90+
auto ipc_open_span = start_span(
91+
"cudaIpcOpenMemHandle",
92+
span_category_ipc,
93+
span_props{{"model", model_handle.name()}, {"byte_count", std::to_string(model_handle.byte_count())}});
94+
auto base_device_ptr = get_device_ptr(model_handle.ipc_handle());
95+
stop_span(ipc_open_span);
96+
97+
for (const auto layer : layers) {
98+
//auto create_layer_span = start_span("to_nd_array",
99+
// span_category_serialization,
100+
// span_props{{"layer", layer.name()}, {"sharing_granularity", "model"}});
101+
102+
keys->emplace_back(layer.name());
103+
const auto shape = to_shape(layer.shape());
104+
auto device_ptr = get_device_ptr_offset(layer, base_device_ptr);
105+
TBlob blob(device_ptr, shape, dev_mask, dev_id);
106+
arrays->emplace_back(blob, dev_id, /* is_shared = */ true);
107+
108+
//stop_span(create_layer_span);
109+
}
110+
return;
111+
}
112+
if (model_handle.sharing_granularity() == SharingGranularity_Layer) {
113+
for (const auto layer : layers) {
114+
//auto create_layer_span = start_span("to_nd_array",
115+
// span_category_serialization,
116+
// span_props{{"layer", layer.name()}, {"sharing_granularity", "layer"}});
117+
118+
keys->emplace_back(layer.name());
119+
const auto shape = to_shape(layer.shape());
120+
auto device_ptr = get_device_ptr(layer);
121+
TBlob blob(device_ptr, shape, dev_mask, dev_id);
122+
arrays->emplace_back(blob, dev_id, /* is_shared = */ true);
123+
124+
//stop_span(create_layer_span);
125+
}
126+
return;
104127
}
105128

106-
// LOG(INFO) << "finished nd_array conversion";
129+
throw dmlc::Error("invalid granularity");
107130

108131
return;
109132
}
@@ -158,6 +181,14 @@ struct client {
158181
ModelHandle Open(const std::string &model_name) {
159182
ModelRequest request;
160183
request.set_name(model_name);
184+
if (UPR_SHARING_GRANULARITY == "model") {
185+
request.set_sharing_granularity(SharingGranularity_Model);
186+
} else if (UPR_SHARING_GRANULARITY == "layer") {
187+
request.set_sharing_granularity(SharingGranularity_Layer);
188+
} else {
189+
throw dmlc::Error(
190+
fmt::format("Error: [{}]. failed to determine model granularity.", UPR_SHARING_GRANULARITY));
191+
}
161192
return this->Open(request);
162193
}
163194

@@ -218,6 +249,7 @@ struct client {
218249
span_category_serialization,
219250
span_props{{"model_id", open_reply.model_id()},
220251
{"byte_count", std::to_string(open_reply.byte_count())},
252+
{"needed_eviction", std::to_string(open_reply.needed_eviction())},
221253
{"nlayers", std::to_string(open_reply.layer().size())}});
222254
defer(stop_span(span_converting));
223255

@@ -247,5 +279,11 @@ void Unload(MXAPIPredictor *pred) {
247279
return;
248280
}
249281

282+
void initialize() {
283+
if (is_client && UPR_ENABLED) {
284+
client::get_connection();
285+
}
286+
}
287+
250288
} // namespace upr
251289
#endif // MXNET_USE_CUDA

src/c_api/ipc.h

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ static const auto UPR_BASE_DIR = dmlc::GetEnv("UPR_BASE_DIR", HOME + std::stri
7878

7979
static const auto UPR_ENABLE_MEMORY_PROFILE = dmlc::GetEnv("UPR_ENABLE_MEMORY_PROFILE", false);
8080
static const auto UPR_ENABLE_CUDA_FREE = dmlc::GetEnv("UPR_ENABLE_CUDA_FREE", false);
81+
static const auto UPR_SHARING_GRANULARITY = dmlc::GetEnv("UPR_SHARING_GRANULARITY", std::string("model"));
8182

8283
static const auto UPRD_EVICTION_POLICY = dmlc::GetEnv("UPRD_EVICTION_POLICY", std::string("lru"));
8384
static const auto UPRD_ESTIMATION_RATE = dmlc::GetEnv("UPRD_ESTIMATION_RATE", 1.0);
@@ -239,8 +240,8 @@ static inline engine::OprExecStat *start_span(const std::string &name, std::stri
239240
#if MXNET_USE_PROFILER
240241
const auto ctx = get_ctx();
241242
auto opr_stat = engine::Profiler::Get()->AddOprStat(ctx.dev_type, ctx.dev_id, name);
242-
uint64_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
243-
engine::SetOprCategory(opr_stat, category);
243+
// uint64_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
244+
opr_stat->category = category;
244245
engine::SetOprStart(opr_stat);
245246
return opr_stat;
246247
#else
@@ -250,11 +251,13 @@ static inline engine::OprExecStat *start_span(const std::string &name, std::stri
250251

251252
static inline engine::OprExecStat *start_span(const std::string &name, std::string category, span_props props) {
252253
#if MXNET_USE_PROFILER
253-
auto span = start_span(name, category);
254-
for (const auto kv : props) {
255-
engine::AddOprMetadata(span, kv.first, kv.second);
256-
}
257-
return span;
254+
const auto ctx = get_ctx();
255+
auto opr_stat = engine::Profiler::Get()->AddOprStat(ctx.dev_type, ctx.dev_id, name);
256+
// uint64_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
257+
opr_stat->category = category;
258+
opr_stat->metadata = props;
259+
engine::SetOprStart(opr_stat);
260+
return opr_stat;
258261
#else
259262
return nullptr;
260263
#endif
@@ -443,5 +446,7 @@ void Unload(mxnet::MXAPIPredictor *pred);
443446

444447
std::pair<std::string, std::string> Load(std::string model_name, std::vector<mxnet::NDArray> *data,
445448
std::vector<std::string> *keys);
449+
450+
void initialize();
446451
} // namespace upr
447452
#endif // MXNET_USE_CUDA

0 commit comments

Comments
 (0)