@@ -283,7 +283,9 @@ StubLauncher::Launch()
283
283
// Push a dummy message to the message queue so that the stub
284
284
// process is notified that it can release the object stored in
285
285
// shared memory.
286
- stub_message_queue_->Push (DUMMY_MESSAGE);
286
+ if (stub_message_queue_) {
287
+ stub_message_queue_->Push (DUMMY_MESSAGE);
288
+ }
287
289
288
290
// If the model is not initialized, wait for the stub process to exit.
289
291
if (!is_initialized_) {
@@ -302,11 +304,23 @@ StubLauncher::Launch()
302
304
//
303
305
// The reason it is broken into two steps is that creation of the health
304
306
// monitoring thread may take longer which can make the server process think
305
- // that the stub process is unhealthy and return early. Waiting until the
306
- // health thread is spawn would make sure would prevent this issue.
307
- parent_message_queue_->Pop ();
307
+ // that the stub process is unhealthy and return early. Waiting with a longer
308
+ // timeout prevents this issue.
309
+ const uint64_t initialization_timeout_ms = 10000 ; // 10 sec
310
+ LOG_MESSAGE (
311
+ TRITONSERVER_LOG_VERBOSE,
312
+ " Waiting for the stub health monitoring thread to start" );
313
+
314
+ bi::managed_external_buffer::handle_t message;
315
+ auto err = ReceiveMessageFromStub (message, initialization_timeout_ms);
316
+ if (err != nullptr ) {
317
+ KillStubProcess ();
318
+ }
308
319
309
320
if (stub_process_kind_ == " AUTOCOMPLETE_STUB" ) {
321
+ if (err != nullptr ) {
322
+ throw BackendModelException (err);
323
+ }
310
324
try {
311
325
AutocompleteStubProcess ();
312
326
}
@@ -317,6 +331,7 @@ StubLauncher::Launch()
317
331
TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
318
332
}
319
333
} else if (stub_process_kind_ == " MODEL_INSTANCE_STUB" ) {
334
+ RETURN_IF_ERROR (err);
320
335
RETURN_IF_ERROR (ModelInstanceStubProcess ());
321
336
} else {
322
337
return TRITONSERVER_ErrorNew (
@@ -509,7 +524,9 @@ StubLauncher::Launch()
509
524
// Push a dummy message to the message queue so that the stub
510
525
// process is notified that it can release the object stored in
511
526
// shared memory.
512
- stub_message_queue_->Push (DUMMY_MESSAGE);
527
+ if (stub_message_queue_) {
528
+ stub_message_queue_->Push (DUMMY_MESSAGE);
529
+ }
513
530
514
531
// If the model is not initialized, wait for the stub process to exit.
515
532
if (!is_initialized_) {
@@ -528,11 +545,23 @@ StubLauncher::Launch()
528
545
//
529
546
// The reason it is broken into two steps is that creation of the health
530
547
// monitoring thread may take longer which can make the server process think
531
- // that the stub process is unhealthy and return early. Waiting until the
532
- // health thread is spawn would prevent this issue.
533
- parent_message_queue_->Pop ();
548
+ // that the stub process is unhealthy and return early. Waiting with a
549
+ // longer timeout prevents this issue.
550
+ const uint64_t initialization_timeout_ms = 10000 ; // 10 sec
551
+ LOG_MESSAGE (
552
+ TRITONSERVER_LOG_VERBOSE,
553
+ " Waiting for the stub health monitoring thread to start" );
554
+
555
+ bi::managed_external_buffer::handle_t message;
556
+ auto err = ReceiveMessageFromStub (message, initialization_timeout_ms);
557
+ if (err != nullptr ) {
558
+ KillStubProcess ();
559
+ }
534
560
535
561
if (stub_process_kind_ == " AUTOCOMPLETE_STUB" ) {
562
+ if (err != nullptr ) {
563
+ throw BackendModelException (err);
564
+ }
536
565
try {
537
566
AutocompleteStubProcess ();
538
567
}
@@ -543,6 +572,7 @@ StubLauncher::Launch()
543
572
TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
544
573
}
545
574
} else if (stub_process_kind_ == " MODEL_INSTANCE_STUB" ) {
575
+ RETURN_IF_ERROR (err);
546
576
RETURN_IF_ERROR (ModelInstanceStubProcess ());
547
577
} else {
548
578
return TRITONSERVER_ErrorNew (
@@ -663,8 +693,13 @@ StubLauncher::ModelInstanceStubProcess()
663
693
initialize_message->Args () = initialize_map_handle;
664
694
stub_message_queue_->Push (initialize_message->ShmHandle ());
665
695
696
+ const uint64_t initialization_timeout_ms = 5000 ; // 5 sec
697
+ LOG_MESSAGE (
698
+ TRITONSERVER_LOG_VERBOSE,
699
+ " Waiting for the stub process initialization response" );
700
+
666
701
bi::managed_external_buffer::handle_t message;
667
- RETURN_IF_ERROR (ReceiveMessageFromStub (message));
702
+ RETURN_IF_ERROR (ReceiveMessageFromStub (message, initialization_timeout_ms ));
668
703
669
704
std::unique_ptr<IPCMessage> initialize_response_message =
670
705
IPCMessage::LoadFromSharedMemory (shm_pool_, message);
@@ -797,11 +832,11 @@ StubLauncher::KillStubProcess()
797
832
798
833
TRITONSERVER_Error*
799
834
StubLauncher::ReceiveMessageFromStub (
800
- bi::managed_external_buffer::handle_t & message)
835
+ bi::managed_external_buffer::handle_t & message,
836
+ uint64_t timeout_miliseconds)
801
837
{
802
838
bool success = false ;
803
839
while (!success) {
804
- uint64_t timeout_miliseconds = 1000 ;
805
840
{
806
841
boost::posix_time::ptime timeout =
807
842
boost::get_system_time () +
0 commit comments