@@ -66,27 +66,27 @@ class NodeMigrationError(Exception):
 
 
 def _monitor_pool_health(
-    manager: PoolManager, timeout: float, drained: Collection[ClusterNodeMetadata], check_pods: bool = True
+    manager: PoolManager, timeout: float, drained: Collection[ClusterNodeMetadata], ignore_pod_health: bool = False
 ) -> bool:
     """Monitor pool health after nodes were submitted for draining
 
     :param PoolManager manager: pool manager instance
     :param float timeout: timestamp after which giving up
     :param Collection[ClusterNodeMetadata] drained: nodes which were submitted for draining
-    :param bool check_pods: check that pods can successfully be scheduled
+    :param bool ignore_pod_health: if set, do not check that pods can successfully be scheduled
     :return: true if capacity is fulfilled
     """
     draining_happened = False
     connector = cast(KubernetesClusterConnector, manager.cluster_connector)
     while time.time() < timeout:
-        manager.reload_state()
+        manager.reload_state(load_pods_info=not ignore_pod_health)
         draining_happened = draining_happened or not any(
             node.agent.agent_id == connector.get_agent_metadata(node.instance.ip_address).agent_id for node in drained
         )
         if (
             draining_happened
             and manager.is_capacity_satisfied()
-            and (not check_pods or connector.has_enough_capacity_for_pods())
+            and (ignore_pod_health or connector.has_enough_capacity_for_pods())
         ):
             return True
         time.sleep(HEALTH_CHECK_INTERVAL_SECONDS)
@@ -114,7 +114,9 @@ def _drain_node_selection(
             logger.info(f"Recycling node {node.instance.instance_id}")
             manager.submit_for_draining(node)
         time.sleep(worker_setup.bootstrap_wait)
-        if not _monitor_pool_health(manager, start_time + worker_setup.bootstrap_timeout, selection_chunk):
+        if not _monitor_pool_health(
+            manager, start_time + worker_setup.bootstrap_timeout, selection_chunk, worker_setup.ignore_pod_health
+        ):
             logger.warning(
                 f"Pool {manager.cluster}:{manager.pool} did not come back"
                 " to desired capacity, stopping selection draining"
@@ -145,7 +147,7 @@ def uptime_migration_worker(
         else:
             logger.warning(f"Pool {cluster}:{pool} is currently underprovisioned, skipping uptime migration iteration")
         time.sleep(UPTIME_CHECK_INTERVAL_SECONDS)
-        manager.reload_state()
+        manager.reload_state(load_pods_info=not worker_setup.ignore_pod_health)
 
 
 def event_migration_worker(migration_event: MigrationEvent, worker_setup: WorkerSetup, pool_lock: LockBase) -> None:
@@ -158,7 +160,7 @@ def event_migration_worker(migration_event: MigrationEvent, worker_setup: Worker
     manager = PoolManager(migration_event.cluster, migration_event.pool, SUPPORTED_POOL_SCHEDULER, fetch_state=False)
     connector = cast(KubernetesClusterConnector, manager.cluster_connector)
     connector.set_label_selectors(migration_event.label_selectors, add_to_existing=True)
-    manager.reload_state()
+    manager.reload_state(load_pods_info=not worker_setup.ignore_pod_health)
     try:
         pool_lock.acquire(timeout=worker_setup.expected_duration)
         pool_lock_acquired = True
@@ -177,7 +179,10 @@ def event_migration_worker(migration_event: MigrationEvent, worker_setup: Worker
         prescaled_capacity = round(manager.target_capacity + (offset * avg_weight))
         manager.modify_target_capacity(prescaled_capacity)
         if not _monitor_pool_health(
-            manager, time.time() + INITIAL_POOL_HEALTH_TIMEOUT_SECONDS, drained=[], check_pods=False
+            manager,
+            time.time() + INITIAL_POOL_HEALTH_TIMEOUT_SECONDS,
+            drained=[],
+            ignore_pod_health=True,
         ):
             raise NodeMigrationError(f"Pool {migration_event.cluster}:{migration_event.pool} is not healthy")
         node_selector = lambda node: node.agent.agent_id and not migration_event.condition.matches(node)  # noqa
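
For readers wiring this flag up elsewhere, here is a minimal runnable sketch of the call pattern the diff introduces. Everything below except the reload_state(load_pods_info=...) and ignore_pod_health keyword arguments is hypothetical: WorkerSetup's fields, the Fake* stand-ins, and monitor_pool_health_sketch only mimic the real PoolManager, KubernetesClusterConnector, and _monitor_pool_health for illustration.

import time
from dataclasses import dataclass
from typing import Collection


@dataclass
class WorkerSetup:
    # Hypothetical stand-in for the real worker configuration object.
    bootstrap_timeout: float
    ignore_pod_health: bool = False  # assumed default: pod health is checked unless opted out


class FakeConnector:
    # Toy connector: pretends pods always fit so the sketch runs standalone.
    def has_enough_capacity_for_pods(self) -> bool:
        return True


class FakeManager:
    # Toy manager exposing only the two calls touched by this diff.
    cluster_connector = FakeConnector()

    def reload_state(self, load_pods_info: bool = True) -> None:
        print(f"reload_state(load_pods_info={load_pods_info})")

    def is_capacity_satisfied(self) -> bool:
        return True


def monitor_pool_health_sketch(
    manager: FakeManager, timeout: float, drained: Collection, ignore_pod_health: bool = False
) -> bool:
    # Same shape as _monitor_pool_health above, minus the draining bookkeeping:
    # skip pod info on reload and skip the pod capacity check when opted out.
    while time.time() < timeout:
        manager.reload_state(load_pods_info=not ignore_pod_health)
        if manager.is_capacity_satisfied() and (
            ignore_pod_health or manager.cluster_connector.has_enough_capacity_for_pods()
        ):
            return True
        time.sleep(1)
    return False


setup = WorkerSetup(bootstrap_timeout=5.0, ignore_pod_health=True)
print(monitor_pool_health_sketch(FakeManager(), time.time() + setup.bootstrap_timeout, [], setup.ignore_pod_health))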