@@ -283,7 +283,9 @@ StubLauncher::Launch()
283283 // Push a dummy message to the message queue so that the stub
284284 // process is notified that it can release the object stored in
285285 // shared memory.
286- stub_message_queue_->Push (DUMMY_MESSAGE);
286+ if (stub_message_queue_) {
287+ stub_message_queue_->Push (DUMMY_MESSAGE);
288+ }
287289
288290 // If the model is not initialized, wait for the stub process to exit.
289291 if (!is_initialized_) {
@@ -302,11 +304,23 @@ StubLauncher::Launch()
302304 //
303305 // The reason it is broken into two steps is that creation of the health
304306 // monitoring thread may take longer which can make the server process think
305- // that the stub process is unhealthy and return early. Waiting until the
306- // health thread is spawn would make sure would prevent this issue.
307- parent_message_queue_->Pop ();
307+ // that the stub process is unhealthy and return early. Waiting with a longer
308+ // timeout prevents this issue.
309+ const uint64_t initialization_timeout_ms = 10000 ; // 10 sec
310+ LOG_MESSAGE (
311+ TRITONSERVER_LOG_VERBOSE,
312+ " Waiting for the stub health monitoring thread to start" );
313+
314+ bi::managed_external_buffer::handle_t message;
315+ auto err = ReceiveMessageFromStub (message, initialization_timeout_ms);
316+ if (err != nullptr ) {
317+ KillStubProcess ();
318+ }
308319
309320 if (stub_process_kind_ == " AUTOCOMPLETE_STUB" ) {
321+ if (err != nullptr ) {
322+ throw BackendModelException (err);
323+ }
310324 try {
311325 AutocompleteStubProcess ();
312326 }
@@ -317,6 +331,7 @@ StubLauncher::Launch()
317331 TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
318332 }
319333 } else if (stub_process_kind_ == " MODEL_INSTANCE_STUB" ) {
334+ RETURN_IF_ERROR (err);
320335 RETURN_IF_ERROR (ModelInstanceStubProcess ());
321336 } else {
322337 return TRITONSERVER_ErrorNew (
@@ -509,7 +524,9 @@ StubLauncher::Launch()
509524 // Push a dummy message to the message queue so that the stub
510525 // process is notified that it can release the object stored in
511526 // shared memory.
512- stub_message_queue_->Push (DUMMY_MESSAGE);
527+ if (stub_message_queue_) {
528+ stub_message_queue_->Push (DUMMY_MESSAGE);
529+ }
513530
514531 // If the model is not initialized, wait for the stub process to exit.
515532 if (!is_initialized_) {
@@ -528,11 +545,23 @@ StubLauncher::Launch()
528545 //
529546 // The reason it is broken into two steps is that creation of the health
530547 // monitoring thread may take longer which can make the server process think
531- // that the stub process is unhealthy and return early. Waiting until the
532- // health thread is spawn would prevent this issue.
533- parent_message_queue_->Pop ();
548+ // that the stub process is unhealthy and return early. Waiting with a
549+ // longer timeout prevents this issue.
550+ const uint64_t initialization_timeout_ms = 10000 ; // 10 sec
551+ LOG_MESSAGE (
552+ TRITONSERVER_LOG_VERBOSE,
553+ " Waiting for the stub health monitoring thread to start" );
554+
555+ bi::managed_external_buffer::handle_t message;
556+ auto err = ReceiveMessageFromStub (message, initialization_timeout_ms);
557+ if (err != nullptr ) {
558+ KillStubProcess ();
559+ }
534560
535561 if (stub_process_kind_ == " AUTOCOMPLETE_STUB" ) {
562+ if (err != nullptr ) {
563+ throw BackendModelException (err);
564+ }
536565 try {
537566 AutocompleteStubProcess ();
538567 }
@@ -543,6 +572,7 @@ StubLauncher::Launch()
543572 TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
544573 }
545574 } else if (stub_process_kind_ == " MODEL_INSTANCE_STUB" ) {
575+ RETURN_IF_ERROR (err);
546576 RETURN_IF_ERROR (ModelInstanceStubProcess ());
547577 } else {
548578 return TRITONSERVER_ErrorNew (
@@ -663,8 +693,13 @@ StubLauncher::ModelInstanceStubProcess()
663693 initialize_message->Args () = initialize_map_handle;
664694 stub_message_queue_->Push (initialize_message->ShmHandle ());
665695
696+ const uint64_t initialization_timeout_ms = 5000 ; // 5 sec
697+ LOG_MESSAGE (
698+ TRITONSERVER_LOG_VERBOSE,
699+ " Waiting for the stub process initialization response" );
700+
666701 bi::managed_external_buffer::handle_t message;
667- RETURN_IF_ERROR (ReceiveMessageFromStub (message));
702+ RETURN_IF_ERROR (ReceiveMessageFromStub (message, initialization_timeout_ms ));
668703
669704 std::unique_ptr<IPCMessage> initialize_response_message =
670705 IPCMessage::LoadFromSharedMemory (shm_pool_, message);
@@ -797,11 +832,11 @@ StubLauncher::KillStubProcess()
797832
798833TRITONSERVER_Error*
799834StubLauncher::ReceiveMessageFromStub (
800- bi::managed_external_buffer::handle_t & message)
835+ bi::managed_external_buffer::handle_t & message,
836+ uint64_t timeout_miliseconds)
801837{
802838 bool success = false ;
803839 while (!success) {
804- uint64_t timeout_miliseconds = 1000 ;
805840 {
806841 boost::posix_time::ptime timeout =
807842 boost::get_system_time () +
0 commit comments