@@ -280,7 +280,9 @@ StubLauncher::Launch()
280280    //  Push a dummy message to the message queue so that the stub
281281    //  process is notified that it can release the object stored in
282282    //  shared memory.
283-     stub_message_queue_->Push (DUMMY_MESSAGE);
283+     if  (stub_message_queue_) {
284+       stub_message_queue_->Push (DUMMY_MESSAGE);
285+     }
284286
285287    //  If the model is not initialized, wait for the stub process to exit.
286288    if  (!is_initialized_) {
@@ -299,11 +301,23 @@ StubLauncher::Launch()
299301  // 
300302  //  The reason it is broken into two steps is that creation of the health
301303  //  monitoring thread may take longer which can make the server process think
302-   //  that the stub process is unhealthy and return early. Waiting until the
303-   //  health thread is spawn would make sure would prevent this issue.
304-   parent_message_queue_->Pop ();
304+   //  that the stub process is unhealthy and return early. Waiting with a longer
305+   //  timeout prevents this issue.
306+   const  uint64_t  initialization_timeout_ms = 10000 ;  //  10 sec
307+   LOG_MESSAGE (
308+       TRITONSERVER_LOG_VERBOSE,
309+       " Waiting for the stub health monitoring thread to start"  );
310+ 
311+   bi::managed_external_buffer::handle_t  message;
312+   auto  err = ReceiveMessageFromStub (message, initialization_timeout_ms);
313+   if  (err != nullptr ) {
314+     KillStubProcess ();
315+   }
305316
306317  if  (stub_process_kind_ == " AUTOCOMPLETE_STUB"  ) {
318+     if  (err != nullptr ) {
319+       throw  BackendModelException (err);
320+     }
307321    try  {
308322      AutocompleteStubProcess ();
309323    }
@@ -314,6 +328,7 @@ StubLauncher::Launch()
314328          TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
315329    }
316330  } else  if  (stub_process_kind_ == " MODEL_INSTANCE_STUB"  ) {
331+     RETURN_IF_ERROR (err);
317332    RETURN_IF_ERROR (ModelInstanceStubProcess ());
318333  } else  {
319334    return  TRITONSERVER_ErrorNew (
@@ -435,7 +450,9 @@ StubLauncher::Launch()
435450      //  Push a dummy message to the message queue so that the stub
436451      //  process is notified that it can release the object stored in
437452      //  shared memory.
438-       stub_message_queue_->Push (DUMMY_MESSAGE);
453+       if  (stub_message_queue_) {
454+         stub_message_queue_->Push (DUMMY_MESSAGE);
455+       }
439456
440457      //  If the model is not initialized, wait for the stub process to exit.
441458      if  (!is_initialized_) {
@@ -456,11 +473,23 @@ StubLauncher::Launch()
456473    // 
457474    //  The reason it is broken into two steps is that creation of the health
458475    //  monitoring thread may take longer which can make the server process think
459-     //  that the stub process is unhealthy and return early. Waiting until the
460-     //  health thread is spawn would prevent this issue.
461-     parent_message_queue_->Pop ();
476+     //  that the stub process is unhealthy and return early. Waiting with a
477+     //  longer timeout prevents this issue.
478+     const  uint64_t  initialization_timeout_ms = 10000 ;  //  10 sec
479+     LOG_MESSAGE (
480+         TRITONSERVER_LOG_VERBOSE,
481+         " Waiting for the stub health monitoring thread to start"  );
482+ 
483+     bi::managed_external_buffer::handle_t  message;
484+     auto  err = ReceiveMessageFromStub (message, initialization_timeout_ms);
485+     if  (err != nullptr ) {
486+       KillStubProcess ();
487+     }
462488
463489    if  (stub_process_kind_ == " AUTOCOMPLETE_STUB"  ) {
490+       if  (err != nullptr ) {
491+         throw  BackendModelException (err);
492+       }
464493      try  {
465494        AutocompleteStubProcess ();
466495      }
@@ -471,6 +500,7 @@ StubLauncher::Launch()
471500            TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
472501      }
473502    } else  if  (stub_process_kind_ == " MODEL_INSTANCE_STUB"  ) {
503+       RETURN_IF_ERROR (err);
474504      RETURN_IF_ERROR (ModelInstanceStubProcess ());
475505    } else  {
476506      return  TRITONSERVER_ErrorNew (
@@ -592,8 +622,13 @@ StubLauncher::ModelInstanceStubProcess()
592622  initialize_message->Args () = initialize_map_handle;
593623  stub_message_queue_->Push (initialize_message->ShmHandle ());
594624
625+   const  uint64_t  initialization_timeout_ms = 5000 ;  //  5 sec
626+   LOG_MESSAGE (
627+       TRITONSERVER_LOG_VERBOSE,
628+       " Waiting for the stub process initialization response"  );
629+ 
595630  bi::managed_external_buffer::handle_t  message;
596-   RETURN_IF_ERROR (ReceiveMessageFromStub (message));
631+   RETURN_IF_ERROR (ReceiveMessageFromStub (message, initialization_timeout_ms ));
597632
598633  std::unique_ptr<IPCMessage> initialize_response_message =
599634      IPCMessage::LoadFromSharedMemory (shm_pool_, message);
@@ -726,11 +761,11 @@ StubLauncher::KillStubProcess()
726761
727762TRITONSERVER_Error*
728763StubLauncher::ReceiveMessageFromStub (
729-     bi::managed_external_buffer::handle_t & message)
764+     bi::managed_external_buffer::handle_t & message,
765+     uint64_t  timeout_miliseconds)
730766{
731767  bool  success = false ;
732768  while  (!success) {
733-     uint64_t  timeout_miliseconds = 1000 ;
734769    {
735770      boost::posix_time::ptime timeout =
736771          boost::get_system_time () +
0 commit comments