This repository was archived by the owner on Oct 16, 2024. It is now read-only.

Commit 9e1a795

Merge pull request #209 from Yelp/mpiano/CLUSTERMAN-691
2 parents: ee8dec9 + f65c132

8 files changed: +42 additions, -27 deletions

clusterman/autoscaler/pool_manager.py

Lines changed: 3 additions & 2 deletions

@@ -84,12 +84,13 @@ def __init__(
         if fetch_state:
             self.reload_state()

-    def reload_state(self) -> None:
+    def reload_state(self, **cluster_connector_kwargs) -> None:
         """Fetch any state that may have changed behind our back, but which we do not want to change during an
         ``Autoscaler.run()``.
         """
         logger.info("Reloading cluster connector state")
-        self.cluster_connector.reload_state()
+        # TODO: update mypy to avoid having to ignore this error (CLUSTERMAN-692)
+        self.cluster_connector.reload_state(**cluster_connector_kwargs)  # type: ignore

         logger.info("Reloading resource groups")
         self._reload_resource_groups()
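The new **cluster_connector_kwargs pass-through lets callers control connector-specific behaviour (such as the Kubernetes connector's load_pods_info flag, shown in the next file) without widening the PoolManager API; the # type: ignore silences a mypy complaint about the forwarded call, tracked in the TODO as CLUSTERMAN-692. A minimal usage sketch, not taken from this commit, with hypothetical cluster/pool/scheduler values:

    from clusterman.autoscaler.pool_manager import PoolManager

    # Hypothetical arguments; real values come from cluster configuration.
    manager = PoolManager("norcal-prod", "seagull", "kubernetes", fetch_state=False)
    # Keyword arguments are forwarded verbatim to the cluster connector:
    manager.reload_state(load_pods_info=False)  # refresh nodes/resource groups, skip pod info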

clusterman/kubernetes/kubernetes_cluster_connector.py

Lines changed: 10 additions & 7 deletions

@@ -94,20 +94,23 @@ def __init__(self, cluster: str, pool: Optional[str], init_crd: bool = False) ->
             )
             self._label_selectors.append(f"{node_label_selector}={self.pool}")

-    def reload_state(self) -> None:
+    def reload_state(self, load_pods_info: bool = True) -> None:
         logger.info("Reloading nodes")

         self.reload_client()

         # store the previous _nodes_by_ip for use in get_removed_nodes_before_last_reload()
         self._prev_nodes_by_ip = copy.deepcopy(self._nodes_by_ip)
         self._nodes_by_ip = self._get_nodes_by_ip()
-        logger.info("Reloading pods")
-        (self._pods_by_ip, self._unschedulable_pods, self._excluded_pods_by_ip,) = (
-            self._get_pods_info_with_label()
-            if self.pool_config.read_bool("use_labels_for_pods", default=False)
-            else self._get_pods_info()
-        )
+        if load_pods_info:
+            logger.info("Reloading pods")
+            self._pods_by_ip, self._unschedulable_pods, self._excluded_pods_by_ip = (
+                self._get_pods_info_with_label()
+                if self.pool_config.read_bool("use_labels_for_pods", default=False)
+                else self._get_pods_info()
+            )
+        else:
+            self._pods_by_ip, self._unschedulable_pods, self._excluded_pods_by_ip = ({}, [], {})

     def reload_client(self) -> None:
         self._core_api = CachedCoreV1Api(self.kubeconfig_path)
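With load_pods_info=False the connector still refreshes its client and node view, but resets the pod-derived state to empty containers instead of re-fetching pod information. A brief sketch of the two call patterns, with hypothetical cluster/pool names:

    from clusterman.kubernetes.kubernetes_cluster_connector import KubernetesClusterConnector

    connector = KubernetesClusterConnector("norcal-prod", "seagull")  # hypothetical values
    connector.reload_state()                      # default: reload nodes and pods
    connector.reload_state(load_pods_info=False)  # reload nodes only; pod state becomes empty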

clusterman/migration/settings.py

Lines changed: 2 additions & 0 deletions

@@ -78,6 +78,7 @@ class WorkerSetup(NamedTuple):
     bootstrap_timeout: float
     disable_autoscaling: bool
     expected_duration: float
+    ignore_pod_health: bool = False

     @classmethod
     def from_config(cls, config: dict) -> "WorkerSetup":

@@ -90,4 +91,5 @@ def from_config(cls, config: dict) -> "WorkerSetup":
             bootstrap_timeout=parse_time_interval_seconds(strat_conf.get("bootstrap_timeout", DEFAULT_NODE_BOOT_WAIT)),
             disable_autoscaling=config.get("disable_autoscaling", False),
             expected_duration=parse_time_interval_seconds(config.get("expected_duration", DEFAULT_WORKER_TIMEOUT)),
+            ignore_pod_health=config.get("ignore_pod_health", False),
         )
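Because the new field defaults to False both on the NamedTuple and in the from_config() lookup, existing pool configurations and existing call sites keep their current behaviour. A quick illustrative check (a sketch, not part of this commit):

    from clusterman.migration.settings import WorkerSetup

    # The typed NamedTuple records the default, so older call sites still work, and
    # configs without the key fall back to False via config.get("ignore_pod_health", False).
    assert WorkerSetup._field_defaults.get("ignore_pod_health") is False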

clusterman/migration/worker.py

Lines changed: 13 additions & 8 deletions

@@ -66,27 +66,27 @@ class NodeMigrationError(Exception):


 def _monitor_pool_health(
-    manager: PoolManager, timeout: float, drained: Collection[ClusterNodeMetadata], check_pods: bool = True
+    manager: PoolManager, timeout: float, drained: Collection[ClusterNodeMetadata], ignore_pod_health: bool = False
 ) -> bool:
     """Monitor pool health after nodes were submitted for draining

     :param PoolManager manager: pool manager instance
     :param float timeout: timestamp after which giving up
     :param Collection[ClusterNodeMetadata] drained: nodes which were submitted for draining
-    :param bool check_pods: check that pods can successfully be scheduled
+    :param bool ignore_pod_health: If set, do not check that pods can successfully be scheduled
     :return: true if capacity is fulfilled
     """
     draining_happened = False
     connector = cast(KubernetesClusterConnector, manager.cluster_connector)
     while time.time() < timeout:
-        manager.reload_state()
+        manager.reload_state(load_pods_info=not ignore_pod_health)
         draining_happened = draining_happened or not any(
             node.agent.agent_id == connector.get_agent_metadata(node.instance.ip_address).agent_id for node in drained
         )
         if (
             draining_happened
             and manager.is_capacity_satisfied()
-            and (not check_pods or connector.has_enough_capacity_for_pods())
+            and (ignore_pod_health or connector.has_enough_capacity_for_pods())
         ):
             return True
         time.sleep(HEALTH_CHECK_INTERVAL_SECONDS)

@@ -114,7 +114,9 @@ def _drain_node_selection(
             logger.info(f"Recycling node {node.instance.instance_id}")
             manager.submit_for_draining(node)
         time.sleep(worker_setup.bootstrap_wait)
-        if not _monitor_pool_health(manager, start_time + worker_setup.bootstrap_timeout, selection_chunk):
+        if not _monitor_pool_health(
+            manager, start_time + worker_setup.bootstrap_timeout, selection_chunk, worker_setup.ignore_pod_health
+        ):
             logger.warning(
                 f"Pool {manager.cluster}:{manager.pool} did not come back"
                 " to desired capacity, stopping selection draining"

@@ -145,7 +147,7 @@ uptime_migration_worker(
         else:
             logger.warning(f"Pool {cluster}:{pool} is currently underprovisioned, skipping uptime migration iteration")
         time.sleep(UPTIME_CHECK_INTERVAL_SECONDS)
-        manager.reload_state()
+        manager.reload_state(load_pods_info=not worker_setup.ignore_pod_health)


 def event_migration_worker(migration_event: MigrationEvent, worker_setup: WorkerSetup, pool_lock: LockBase) -> None:

@@ -158,7 +160,7 @@ def event_migration_worker(migration_event: MigrationEvent, worker_setup: Worker
     manager = PoolManager(migration_event.cluster, migration_event.pool, SUPPORTED_POOL_SCHEDULER, fetch_state=False)
     connector = cast(KubernetesClusterConnector, manager.cluster_connector)
     connector.set_label_selectors(migration_event.label_selectors, add_to_existing=True)
-    manager.reload_state()
+    manager.reload_state(load_pods_info=not worker_setup.ignore_pod_health)
     try:
         pool_lock.acquire(timeout=worker_setup.expected_duration)
         pool_lock_acquired = True

@@ -177,7 +179,10 @@ event_migration_worker(migration_event: MigrationEvent, worker_setup: Worker
             prescaled_capacity = round(manager.target_capacity + (offset * avg_weight))
             manager.modify_target_capacity(prescaled_capacity)
         if not _monitor_pool_health(
-            manager, time.time() + INITIAL_POOL_HEALTH_TIMEOUT_SECONDS, drained=[], check_pods=False
+            manager,
+            time.time() + INITIAL_POOL_HEALTH_TIMEOUT_SECONDS,
+            drained=[],
+            ignore_pod_health=True,
         ):
             raise NodeMigrationError(f"Pool {migration_event.cluster}:{migration_event.pool} is not healthy")
         node_selector = lambda node: node.agent.agent_id and not migration_event.condition.matches(node)  # noqa
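The net effect: when ignore_pod_health is set in the worker setup, the migration workers reload pool state without pod information and judge pool health purely on capacity. A condensed paraphrase of the check (illustrative only, not a drop-in replacement for _monitor_pool_health):

    def pool_is_healthy(manager, connector, draining_happened: bool, ignore_pod_health: bool) -> bool:
        # Pod schedulability is only consulted when ignore_pod_health is False.
        return (
            draining_happened
            and manager.is_capacity_satisfied()
            and (ignore_pod_health or connector.has_enough_capacity_for_pods())
        )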

clusterman/simulator/simulated_pool_manager.py

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ def __init__(
             MAX_MIN_NODE_SCALEIN_UPTIME_SECONDS,
         )

-    def reload_state(self) -> None:
+    def reload_state(self, **cluster_connector_kwargs) -> None:
         pass

     def get_node_metadatas(

docs/source/metrics.rst

Lines changed: 9 additions & 9 deletions

@@ -85,17 +85,17 @@ cpus_allocated|cluster=norcal-prod,pool=appA_pool 1502405756 22
 mem_allocated|cluster=norcal-prod,pool=appB_pool 1502405810 20
 ================================================= ========== =====

-+---------------------------------------------------------------------------------------------------+-------------------------+-------------------------+
-| Metadata                                            |                         |                         |
-+-----------------------------------------------------+------------+--------------------------------+-------------------------+-------------------------+
-| metric name                                          | timestamp  | value                          | <c3.xlarge, us-west-2a> | <c3.xlarge, us-west-2c> |
-+=====================================================+============+================================+=========================+=========================+
++----------------------------------------------------------------------------+------------+--------------------------------+-------------------------+-------------------------+
+| Metadata                                                                    |            |                                |                         |                         |
++----------------------------------------------------------------------------+------------+--------------------------------+-------------------------+-------------------------+
+| metric name                                                                 | timestamp  | value                          | <c3.xlarge, us-west-2a> | <c3.xlarge, us-west-2c> |
++============================================================================+============+================================+=========================+=========================+
 | spot_prices|aws_availability_zone=us-west-2a,aws_instance_type=c3.xlarge    | 1502405756 | 1.30                           |                         |                         |
-+-----------------------------------------------------+------------+--------------------------------+-------------------------+-------------------------+
++----------------------------------------------------------------------------+------------+--------------------------------+-------------------------+-------------------------+
 | spot_prices|aws_availability_zone=us-west-2c,aws_instance_type=c3.xlarge    | 1502405756 | 5.27                           |                         |                         |
-+-----------------------------------------------------+------------+--------------------------------+-------------------------+-------------------------+
-| fulfilled_capacity|cluster=norcal-prod,pool=seagull  | 1502409314 |                                | 4                       | 20                      |
-+-----------------------------------------------------+------------+--------------------------------+-------------------------+-------------------------+
++----------------------------------------------------------------------------+------------+--------------------------------+-------------------------+-------------------------+
+| fulfilled_capacity|cluster=norcal-prod,pool=seagull                         | 1502409314 |                                | 4                       | 20                      |
++----------------------------------------------------------------------------+------------+--------------------------------+-------------------------+-------------------------+

 .. _metric_name_reference:

docs/source/node_migration.rst

Lines changed: 2 additions & 0 deletions

@@ -42,6 +42,8 @@ The allowed values for the migration settings are as follows:

 * ``disable_autoscaling``: turn off autoscaler while recycling instances (false by default).

+* ``ignore_pod_health``: avoid loading and checking pod information to determine pool health (false by default).
+
 * ``expected_duration``: estimated duration for migration of the whole pool; human readable time string (1 day by default).

 See :ref:`pool_configuration` for how an example configuration block would look like.
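For orientation, the migration settings listed above map onto the dict consumed by WorkerSetup.from_config(). A hedged, illustrative fragment: key names come from the docs, the values are made up, and unrelated keys (such as the draining/bootstrap strategy settings) are omitted; the real pool configuration file nests this block as described under pool_configuration:

    migration_settings = {
        "disable_autoscaling": False,
        "ignore_pod_health": True,   # new flag from this commit; defaults to false
        "expected_duration": "1 day",
    }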

tests/migration/migration_worker_test.py

Lines changed: 2 additions & 0 deletions

@@ -108,13 +108,15 @@ def test_drain_node_selection(mock_monitor, mock_time):
                     ClusterNodeMetadata(AgentMetadata(agent_id=5, task_count=20), InstanceMetadata(None, None)),
                     ClusterNodeMetadata(AgentMetadata(agent_id=4, task_count=22), InstanceMetadata(None, None)),
                 ],
+                False,
             ),
             call(
                 mock_manager,
                 3,
                 [
                     ClusterNodeMetadata(AgentMetadata(agent_id=3, task_count=24), InstanceMetadata(None, None)),
                 ],
+                False,
             ),
         ]
     )
