Skip to content

Commit c1c0886

Browse files
judahrand and nfx authored
Added timeout to w.clusters.ensure_cluster_running() (#227)
## Changes

This picks up where #118 left off.

<!-- Summary of your changes that are easy to understand -->

## Tests

<!-- How is this tested? Please see the checklist below and also describe any other relevant tests -->

- [x] `make test` run locally
- [x] `make fmt` applied
- [x] relevant integration tests applied

---------

Co-authored-by: Serge Smertin <[email protected]>
1 parent 3213b7b commit c1c0886

File tree

1 file changed

+30
-12
lines changed

1 file changed

+30
-12
lines changed

databricks/sdk/mixins/compute.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
1+
import datetime
2+
import logging
13
import re
4+
import time
25
from dataclasses import dataclass
36
from typing import Optional
47

8+
from databricks.sdk.errors import OperationFailed
59
from databricks.sdk.service import compute
610

11+
_LOG = logging.getLogger('databricks.sdk')
12+
713

814
@dataclass
915
class SemVer:
@@ -203,16 +209,28 @@ def select_node_type(self,
203209
return nt.node_type_id
204210
raise ValueError("cannot determine smallest node type")
205211

def ensure_cluster_is_running(self, cluster_id: str) -> None:
    """Ensure that the given cluster is running, regardless of its current state.

    Repeatedly reads the cluster state and reacts to it until the cluster is
    running or an overall deadline of 20 minutes has passed:

    * RUNNING: return immediately.
    * TERMINATED: start the cluster and wait for the start to finish.
    * TERMINATING: wait for termination, then start the cluster and wait.
    * PENDING, RESIZING, RESTARTING: wait until the cluster is running.
    * ERROR, UNKNOWN: raise ``RuntimeError``.

    An ``OperationFailed`` raised by any of these steps is logged at debug
    level and the whole sequence is retried.

    :param cluster_id: identifier of the cluster to bring up.
    :raises RuntimeError: if the cluster reports the ERROR or UNKNOWN state.
    :raises TimeoutError: if the cluster is not running before the deadline.
    """
    timeout = datetime.timedelta(minutes=20)
    deadline = time.time() + timeout.total_seconds()
    state = compute.State  # loop-invariant alias, resolved once
    # NOTE(review): the nested waits below presumably apply their own
    # timeouts, which this deadline does not bound - confirm against the
    # waiter implementations.
    while time.time() < deadline:
        try:
            info = self.get(cluster_id)
            if info.state == state.RUNNING:
                return
            elif info.state == state.TERMINATED:
                self.start(cluster_id).result()
                return
            elif info.state == state.TERMINATING:
                self.wait_get_cluster_terminated(cluster_id)
                self.start(cluster_id).result()
                return
            elif info.state in (state.PENDING, state.RESIZING, state.RESTARTING):
                self.wait_get_cluster_running(cluster_id)
                return
            elif info.state in (state.ERROR, state.UNKNOWN):
                raise RuntimeError(f'Cluster {info.cluster_name} is {info.state}: {info.state_message}')
        except OperationFailed as e:
            _LOG.debug('Operation failed, retrying', exc_info=e)
        # Reached only when the attempt failed or the state matched no branch.
        # Pause briefly so the loop does not call the API back-to-back until
        # the deadline; never sleep past the deadline itself.
        time.sleep(min(1.0, max(0.0, deadline - time.time())))
    raise TimeoutError(f'timed out after {timeout}')

0 commit comments

Comments
 (0)