@@ -280,7 +280,9 @@ StubLauncher::Launch()
280280 // Push a dummy message to the message queue so that the stub
281281 // process is notified that it can release the object stored in
282282 // shared memory.
283- stub_message_queue_->Push (DUMMY_MESSAGE);
283+ if (stub_message_queue_) {
284+ stub_message_queue_->Push (DUMMY_MESSAGE);
285+ }
284286
285287 // If the model is not initialized, wait for the stub process to exit.
286288 if (!is_initialized_) {
@@ -299,11 +301,23 @@ StubLauncher::Launch()
299301 //
300302 // The reason it is broken into two steps is that creation of the health
301303 // monitoring thread may take longer which can make the server process think
302- // that the stub process is unhealthy and return early. Waiting until the
303- // health thread is spawn would make sure would prevent this issue.
304- parent_message_queue_->Pop ();
304+ // that the stub process is unhealthy and return early. Waiting with a longer
305+ // timeout prevents this issue.
306+ const uint64_t initialization_timeout_ms = 10000 ; // 10 sec
307+ LOG_MESSAGE (
308+ TRITONSERVER_LOG_VERBOSE,
309+ " Waiting for the stub health monitoring thread to start" );
310+
311+ bi::managed_external_buffer::handle_t message;
312+ auto err = ReceiveMessageFromStub (message, initialization_timeout_ms);
313+ if (err != nullptr ) {
314+ KillStubProcess ();
315+ }
305316
306317 if (stub_process_kind_ == " AUTOCOMPLETE_STUB" ) {
318+ if (err != nullptr ) {
319+ throw BackendModelException (err);
320+ }
307321 try {
308322 AutocompleteStubProcess ();
309323 }
@@ -314,6 +328,7 @@ StubLauncher::Launch()
314328 TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
315329 }
316330 } else if (stub_process_kind_ == " MODEL_INSTANCE_STUB" ) {
331+ RETURN_IF_ERROR (err);
317332 RETURN_IF_ERROR (ModelInstanceStubProcess ());
318333 } else {
319334 return TRITONSERVER_ErrorNew (
@@ -435,7 +450,9 @@ StubLauncher::Launch()
435450 // Push a dummy message to the message queue so that the stub
436451 // process is notified that it can release the object stored in
437452 // shared memory.
438- stub_message_queue_->Push (DUMMY_MESSAGE);
453+ if (stub_message_queue_) {
454+ stub_message_queue_->Push (DUMMY_MESSAGE);
455+ }
439456
440457 // If the model is not initialized, wait for the stub process to exit.
441458 if (!is_initialized_) {
@@ -456,11 +473,23 @@ StubLauncher::Launch()
456473 //
457474 // The reason it is broken into two steps is that creation of the health
458475 // monitoring thread may take longer which can make the server process think
459- // that the stub process is unhealthy and return early. Waiting until the
460- // health thread is spawn would prevent this issue.
461- parent_message_queue_->Pop ();
476+ // that the stub process is unhealthy and return early. Waiting with a
477+ // longer timeout prevents this issue.
478+ const uint64_t initialization_timeout_ms = 10000 ; // 10 sec
479+ LOG_MESSAGE (
480+ TRITONSERVER_LOG_VERBOSE,
481+ " Waiting for the stub health monitoring thread to start" );
482+
483+ bi::managed_external_buffer::handle_t message;
484+ auto err = ReceiveMessageFromStub (message, initialization_timeout_ms);
485+ if (err != nullptr ) {
486+ KillStubProcess ();
487+ }
462488
463489 if (stub_process_kind_ == " AUTOCOMPLETE_STUB" ) {
490+ if (err != nullptr ) {
491+ throw BackendModelException (err);
492+ }
464493 try {
465494 AutocompleteStubProcess ();
466495 }
@@ -471,6 +500,7 @@ StubLauncher::Launch()
471500 TRITONSERVER_ErrorNew (TRITONSERVER_ERROR_INTERNAL, ex.what ()));
472501 }
473502 } else if (stub_process_kind_ == " MODEL_INSTANCE_STUB" ) {
503+ RETURN_IF_ERROR (err);
474504 RETURN_IF_ERROR (ModelInstanceStubProcess ());
475505 } else {
476506 return TRITONSERVER_ErrorNew (
@@ -592,8 +622,13 @@ StubLauncher::ModelInstanceStubProcess()
592622 initialize_message->Args () = initialize_map_handle;
593623 stub_message_queue_->Push (initialize_message->ShmHandle ());
594624
625+ const uint64_t initialization_timeout_ms = 5000 ; // 5 sec
626+ LOG_MESSAGE (
627+ TRITONSERVER_LOG_VERBOSE,
628+ " Waiting for the stub process initialization response" );
629+
595630 bi::managed_external_buffer::handle_t message;
596- RETURN_IF_ERROR (ReceiveMessageFromStub (message));
631+ RETURN_IF_ERROR (ReceiveMessageFromStub (message, initialization_timeout_ms ));
597632
598633 std::unique_ptr<IPCMessage> initialize_response_message =
599634 IPCMessage::LoadFromSharedMemory (shm_pool_, message);
@@ -726,11 +761,11 @@ StubLauncher::KillStubProcess()
726761
727762TRITONSERVER_Error*
728763StubLauncher::ReceiveMessageFromStub (
729- bi::managed_external_buffer::handle_t & message)
764+ bi::managed_external_buffer::handle_t & message,
765+ uint64_t timeout_miliseconds)
730766{
731767 bool success = false ;
732768 while (!success) {
733- uint64_t timeout_miliseconds = 1000 ;
734769 {
735770 boost::posix_time::ptime timeout =
736771 boost::get_system_time () +
0 commit comments