 #include <vector>

 #include "backend_config.h"
-#include "backend_model_instance.h"
 #include "dynamic_batch_scheduler.h"
 #include "filesystem.h"
 #include "model_config_utils.h"
@@ -165,7 +164,8 @@ TritonModel::Create(
   // Create and initialize the model.
   std::unique_ptr<TritonModel> local_model(new TritonModel(
       server, localized_model_dir, backend, min_compute_capability, version,
-      model_config, auto_complete_config));
+      model_config, auto_complete_config, backend_cmdline_config_map,
+      host_policy_map));

   TritonModel* raw_local_model = local_model.get();

@@ -197,17 +197,7 @@ TritonModel::Create(
   // Initialize the model for Triton core usage
   RETURN_IF_ERROR(local_model->Init(is_config_provided));

-  bool device_blocking = false;
-  if (local_model->backend_->ExecutionPolicy() ==
-      TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
-    if (model_config.has_sequence_batching()) {
-      LOG_INFO << "Overriding execution policy to "
-                  "\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
-               << model_config.name() << "\"";
-    } else {
-      device_blocking = true;
-    }
-  }
+  RETURN_IF_ERROR(local_model->GetExecutionPolicy(model_config));

   // Initialize the custom batching library for the model, if provided.
   if (model_config.has_sequence_batching()) {
@@ -250,17 +240,71 @@ TritonModel::Create(
     }
   }

-  // Create and initialize the model instances for this model.
-  RETURN_IF_ERROR(TritonModelInstance::CreateInstances(
+  // Create or update the model instances for this model.
+  RETURN_IF_ERROR(TritonModelInstance::SetInstances(
       raw_local_model, backend_cmdline_config_map, host_policy_map,
-      model_config, device_blocking));
+      model_config));
+  RETURN_IF_ERROR(local_model->CommitInstances());

   RETURN_IF_ERROR(local_model->SetConfiguredScheduler());

   *model = std::move(local_model);
   return Status::Success;
 }

+Status
+TritonModel::UpdateInstanceGroup(
+    const inference::ModelConfig& new_model_config,
+    std::unique_lock<std::mutex>* caller_lock)
+{
+  // Generate normalized model config with new instance group.
+  inference::ModelConfig model_config = config_;
+  model_config.clear_instance_group();
+  model_config.mutable_instance_group()->Add(
+      new_model_config.instance_group().begin(),
+      new_model_config.instance_group().end());
+  RETURN_IF_ERROR(NormalizeInstanceGroup(
+      min_compute_capability_, backend_->BackendAttributes().preferred_groups_,
+      &model_config));
+  RETURN_IF_ERROR(ValidateInstanceGroup(model_config, min_compute_capability_));
+
+  // Update the instances to the new config.
+  caller_lock->unlock();  // allow inference while creating instances
+  Status status = TritonModelInstance::SetInstances(
+      this, backend_cmdline_config_map_, host_policy_map_, model_config);
+  caller_lock->lock();
+  if (!status.IsOk()) {
+    return status;
+  }
+
+  // At this point, the new model config is ready but not yet written into this
+  // object. The 'caller_lock' is held, so 'model_lifecycle' will pause any new
+  // inference request. It is safe to move forward and commit the change.
+  RETURN_IF_ERROR(SetModelConfig(model_config));
+  RETURN_IF_ERROR(CommitInstances());
+  RETURN_IF_ERROR(SetConfiguredScheduler());
+
+  return Status::Success;
+}
+
+Status
+TritonModel::GetExecutionPolicy(const inference::ModelConfig& model_config)
+{
+  // Set 'device_blocking_'
+  device_blocking_ = false;
+  if (backend_->ExecutionPolicy() == TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
+    if (model_config.has_sequence_batching()) {
+      LOG_INFO << "Overriding execution policy to "
+                  "\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
+               << model_config.name() << "\"";
+    } else {
+      device_blocking_ = true;
+    }
+  }
+
+  return Status::Success;
+}
+
 Status
 TritonModel::ResolveBackendConfigs(
     const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
@@ -323,21 +367,78 @@ TritonModel::SetBackendConfigDefaults(
   return Status::Success;
 }

+std::shared_ptr<TritonModelInstance>
+TritonModel::FindInstance(const TritonModelInstance::Signature& signature) const
+{
+  // The search can be improved by introducing some gradient into comparing
+  // signatures. One solution could be to use hash key. [FIXME: DLIS-4822]
+  for (auto* instances : {&instances_, &passive_instances_}) {
+    for (auto& instance : (*instances)) {
+      if (instance->GetSignature() == signature) {
+        return instance;
+      }
+    }
+  }
+  return std::shared_ptr<TritonModelInstance>();
+}
+
 Status
-TritonModel::AddInstance(
-    std::unique_ptr<TritonModelInstance>&& instance, const bool passive)
+TritonModel::RegisterInstance(
+    std::shared_ptr<TritonModelInstance>&& instance, const bool passive)
 {
+  instance->GetSignature().DisableMatching();
+
   if (passive) {
-    passive_instance_group_map_[instance->GroupName()].emplace_back(
-        std::move(instance));
+    bg_passive_instances_.emplace_back(std::move(instance));
   } else {
-    instance_group_map_[instance->GroupName()].emplace_back(
-        std::move(instance));
+    bg_instances_.emplace_back(std::move(instance));
   }

   return Status::Success;
 }

+Status
+TritonModel::CommitInstances()
+{
+  instances_.swap(bg_instances_);
+  passive_instances_.swap(bg_passive_instances_);
+  bg_instances_.clear();
+  bg_passive_instances_.clear();
+
+  for (auto* instances : {&instances_, &passive_instances_}) {
+    for (auto& instance : (*instances)) {
+      instance->GetSignature().EnableMatching();
+    }
+  }
+
+  return Status::Success;
+}
+
+std::vector<std::shared_ptr<TritonModelInstance>>
+TritonModel::GetInstancesByDevice(int32_t device_id) const
+{
+  std::vector<std::shared_ptr<TritonModelInstance>> result;
+  // Do not match passive instances, as they do not have a backend thread.
+  // Do not match foreground instances, as backend threads cannot be updated.
+  for (auto& instance : bg_instances_) {
+    if (instance->DeviceId() == device_id) {
+      result.push_back(instance);
+    }
+  }
+  return result;
+}
+
+Status
+TritonModel::SetSchedulerMutable(std::unique_ptr<Scheduler> scheduler)
+{
+  if (scheduler_ != nullptr) {
+    LOG_VERBOSE(1) << "Replacing scheduler for model '" + config_.name() + "'";
+  }
+  scheduler_ = std::move(scheduler);
+
+  return Status::Success;
+}
+
 Status
 TritonModel::UpdateModelConfig(
     const uint32_t config_version, TRITONSERVER_Message* updated_config_message)
@@ -443,7 +544,7 @@ TritonModel::SetConfiguredScheduler()
         0 /* max_queue_delay_microseconds */, &scheduler));
   }

-  return SetScheduler(std::move(scheduler));
+  return SetSchedulerMutable(std::move(scheduler));
 }

 Status
@@ -499,40 +600,20 @@ TritonModel::SetBatchingStrategy(const std::string& batch_libpath)
   return Status::Success;
 }

-Status
-TritonModel::Initialize()
-{
-  for (const auto& pair : instance_group_map_) {
-    for (const auto& instance : pair.second) {
-      RETURN_IF_ERROR(instance->Initialize());
-    }
-  }
-
-  return Status::Success;
-}
-
-Status
-TritonModel::WarmUp()
-{
-  for (const auto& pair : instance_group_map_) {
-    for (const auto& instance : pair.second) {
-      RETURN_IF_ERROR(instance->WarmUp());
-    }
-  }
-
-  return Status::Success;
-}
-
 TritonModel::TritonModel(
     InferenceServer* server,
     const std::shared_ptr<LocalizedPath>& localized_model_dir,
     const std::shared_ptr<TritonBackend>& backend,
     const double min_compute_capability, const int64_t version,
-    const inference::ModelConfig& config, const bool auto_complete_config)
+    const inference::ModelConfig& config, const bool auto_complete_config,
+    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
+    const triton::common::HostPolicyCmdlineConfigMap& host_policy_map)
     : Model(
           min_compute_capability, localized_model_dir->Path(), version, config),
       server_(server), min_compute_capability_(min_compute_capability),
       auto_complete_config_(auto_complete_config),
+      backend_cmdline_config_map_(backend_cmdline_config_map),
+      host_policy_map_(host_policy_map), device_blocking_(false),
       localized_model_dir_(localized_model_dir), backend_(backend),
       state_(nullptr)
 {
@@ -556,8 +637,10 @@ TritonModel::~TritonModel()

   // Explicitly delete/finalize all model instances before finalizing
   // the model itself.
-  instance_group_map_.clear();
-  passive_instance_group_map_.clear();
+  instances_.clear();
+  passive_instances_.clear();
+  bg_instances_.clear();
+  bg_passive_instances_.clear();

   // Unregister itself from the rate limiter. Note this should happen
   // after all instances are destructed. Destructing instances ensures
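The core of this change is the double-buffered instance list: RegisterInstance() stages new instances in bg_instances_/bg_passive_instances_, and CommitInstances() swaps the staged lists into the foreground in one step. Keeping the old list intact until the commit is what lets UpdateInstanceGroup() release caller_lock while new instances are built: in-flight requests keep using the foreground instances, and the swap happens only after the lock is reacquired. The following is a minimal, self-contained sketch of that swap pattern, separate from the diff above; Instance, Model::Register, and Model::Commit are illustrative stand-ins, not Triton APIs.

#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Instance {
  std::string name;
  bool matchable = false;  // stands in for Signature Enable/DisableMatching()
};

class Model {
 public:
  // Stage a newly created (or reused) instance in the background list.
  void Register(std::shared_ptr<Instance> instance)
  {
    instance->matchable = false;
    bg_instances_.push_back(std::move(instance));
  }

  // Promote the staged instances to the foreground in one swap; instances
  // that were not re-registered are released when the old list is cleared.
  void Commit()
  {
    instances_.swap(bg_instances_);
    bg_instances_.clear();
    for (auto& instance : instances_) {
      instance->matchable = true;
    }
  }

  void Print() const
  {
    for (const auto& instance : instances_) {
      std::cout << instance->name << "\n";
    }
  }

 private:
  std::vector<std::shared_ptr<Instance>> instances_;     // serving inference
  std::vector<std::shared_ptr<Instance>> bg_instances_;  // staged update
};

int main()
{
  Model model;
  model.Register(std::make_shared<Instance>(Instance{"instance_0"}));
  model.Commit();  // initial load: instance_0 becomes visible
  model.Register(std::make_shared<Instance>(Instance{"instance_1"}));
  model.Commit();  // update: instance_1 replaces instance_0
  model.Print();
  return 0;
}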