|
| 1 | +import datetime |
| 2 | +import logging |
1 | 3 | import re |
| 4 | +import time |
2 | 5 | from dataclasses import dataclass |
3 | 6 | from typing import Optional |
4 | 7 |
|
| 8 | +from databricks.sdk.errors import OperationFailed |
5 | 9 | from databricks.sdk.service import compute |
6 | 10 |
|
| 11 | +_LOG = logging.getLogger('databricks.sdk') |
| 12 | + |
7 | 13 |
|
8 | 14 | @dataclass |
9 | 15 | class SemVer: |
@@ -203,16 +209,28 @@ def select_node_type(self, |
203 | 209 | return nt.node_type_id |
204 | 210 | raise ValueError("cannot determine smallest node type") |
205 | 211 |
|
def ensure_cluster_is_running(self, cluster_id: str) -> None:
    """Ensure that the given cluster is running, regardless of its current state.

    The cluster state is polled for up to 20 minutes. Depending on the
    state observed, the cluster is started, or an in-flight transition is
    awaited first. An ``OperationFailed`` raised while starting or waiting
    is logged at debug level and the whole check is retried after a short
    back-off.

    :param cluster_id: identifier of the cluster to bring up.
    :raises RuntimeError: if the cluster reports the ERROR or UNKNOWN state.
    :raises TimeoutError: if the polling deadline passes before the cluster
        is running; the last ``OperationFailed`` (if any) is attached as
        the cause.
    """
    timeout = datetime.timedelta(minutes=20)
    deadline = time.time() + timeout.total_seconds()
    state = compute.State  # loop-invariant alias, bound once
    max_delay_seconds = 10
    attempt = 0
    last_error = None
    while time.time() < deadline:
        try:
            info = self.get(cluster_id)
            if info.state == state.RUNNING:
                return
            elif info.state == state.TERMINATED:
                self.start(cluster_id).result()
                return
            elif info.state == state.TERMINATING:
                # A cluster cannot be started until termination completes.
                self.wait_get_cluster_terminated(cluster_id)
                self.start(cluster_id).result()
                return
            elif info.state in (state.PENDING, state.RESIZING, state.RESTARTING):
                self.wait_get_cluster_running(cluster_id)
                return
            elif info.state in (state.ERROR, state.UNKNOWN):
                raise RuntimeError(f'Cluster {info.cluster_name} is {info.state}: {info.state_message}')
            # Any other state falls through and is polled again below.
        except OperationFailed as e:
            _LOG.debug('Operation failed, retrying', exc_info=e)
            last_error = e
        # Back off before polling again so that a failing operation or an
        # unrecognised state does not hammer the API in a tight loop.
        attempt += 1
        remaining = deadline - time.time()
        if remaining <= 0:
            break
        time.sleep(min(attempt, max_delay_seconds, remaining))
    raise TimeoutError(f'timed out after {timeout}') from last_error
0 commit comments