Add --wait option to databricks runs submit CLI command (#487)

jerrylian-db · web-flow · commit 121c2a675059 · 2022-06-17T15:13:15.000+02:00
diff --git a/databricks_cli/runs/cli.py b/databricks_cli/runs/cli.py
@@ -21,13 +21,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
+import time
+from json import loads as json_loads
+
 import click
 from tabulate import tabulate
 
 from databricks_cli.click_types import OutputClickType, JsonClickType, RunIdClickType
 from databricks_cli.jobs.cli import check_version
-from databricks_cli.utils import eat_exceptions, CONTEXT_SETTINGS, pretty_format, json_cli_base, \
-    truncate_string
+from databricks_cli.utils import eat_exceptions, CONTEXT_SETTINGS, pretty_format, truncate_string, \
+    error_and_quit, backoff_with_jitter
 from databricks_cli.configure.config import provide_api_client, profile_option, debug_option, \
     api_version_option
 from databricks_cli.runs.api import RunsApi
@@ -39,21 +43,48 @@
               help='File containing JSON request to POST to /api/2.*/jobs/runs/submit.')
 @click.option('--json', default=None, type=JsonClickType(),
               help=JsonClickType.help('/api/2.*/jobs/runs/submit'))
+@click.option('--wait', is_flag=True, default=False,
+              help='Waits for the submitted run to complete.')
 @api_version_option
 @debug_option
 @profile_option
 @eat_exceptions
 @provide_api_client
-def submit_cli(api_client, json_file, json, version):
+def submit_cli(api_client, json_file, json, wait, version):
     """
-    Submits a one-time run.
+    Submits a one-time run and optionally waits for its completion.
 
     The specification for the request json can be found
     https://docs.databricks.com/api/latest/jobs.html#runs-submit
     """
     check_version(api_client, version)
-    json_cli_base(json_file, json, lambda json: RunsApi(
-        api_client).submit_run(json, version=version))
+    # When both inputs are given, the contents of --json-file take precedence.
+    if json_file:
+        with open(json_file, 'r') as f:
+            json = f.read()
+    if json is None:
+        # Fail with a clear message instead of a TypeError from json_loads(None).
+        error_and_quit('Either --json-file or --json should be provided')
+    # One API wrapper serves both the submit call and every status poll below.
+    runs_api = RunsApi(api_client)
+    submit_res = runs_api.submit_run(json_loads(json), version=version)
+    click.echo(pretty_format(submit_res))
+    if not wait:
+        return
+    run_id = submit_res['run_id']
+    completed_states = set(['TERMINATED', 'SKIPPED', 'INTERNAL_ERROR'])
+    prev_life_cycle_state = ""
+    attempt = 0
+    # Poll until the run reaches one of the completed life cycle states. Both exits from
+    # the loop terminate the process: exit code 0 on SUCCESS, error_and_quit otherwise.
+    while True:
+        run = runs_api.get_run(run_id, version=version)
+        run_state = run['state']
+        life_cycle_state = run_state['life_cycle_state']
+        if life_cycle_state in completed_states:
+            # result_state and state_message may be absent from the response (for
+            # example on a SKIPPED or INTERNAL_ERROR run), so they are read with .get
+            # and the life cycle state is reported when there is no result state.
+            result_state = run_state.get('result_state')
+            if result_state == 'SUCCESS':
+                sys.exit(0)
+            error_and_quit('Run failed with state ' + (result_state or life_cycle_state) +
+                           ' and state message ' + run_state.get('state_message', ''))
+        if prev_life_cycle_state != life_cycle_state:
+            # Report only state transitions, and on stderr (err=True).
+            click.echo('Waiting on run to complete. Current state: ' + life_cycle_state +
+                       '. URL: ' + run['run_page_url'], err=True)
+            prev_life_cycle_state = life_cycle_state
+        time.sleep(backoff_with_jitter(attempt))
+        attempt += 1
 
 
 def _runs_to_table(runs_json):
diff --git a/databricks_cli/utils.py b/databricks_cli/utils.py
@@ -21,6 +21,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
+import random
 import sys
 import traceback
 from json import dumps as json_dumps, loads as json_loads
@@ -98,6 +100,20 @@ def error_and_quit(message):
     sys.exit(1)
 
 
+# Upper bound applied to the interval computed by backoff_with_jitter.
+INTERVAL_MAX = 30
+# Interval used for attempt 0; it is doubled for every further attempt.
+INTERVAL_BASE = 5
+# Largest exponent used when doubling, so 2 ** exponent stays bounded for large attempts.
+MAX_EXPONENT = 10
+
+
+def backoff_with_jitter(attempt):
+    """
+    Returns a randomized wait time that grows with the number of attempts already made.
+
+    INTERVAL_BASE is doubled once per attempt (the exponent is capped at MAX_EXPONENT) and
+    the product is clamped to INTERVAL_MAX. The returned integer is drawn from the upper
+    half of that interval.
+    """
+    capped_attempt = min(attempt, MAX_EXPONENT)
+    upper_bound = min(INTERVAL_MAX, INTERVAL_BASE * 2 ** capped_attempt)
+    lower_bound = math.floor(upper_bound * 0.5)
+    # randrange excludes upper_bound itself, so the result is always below the interval.
+    return random.randrange(lower_bound, upper_bound)
+
+
 def pretty_format(json, encode_utf8=False):
     if encode_utf8:
         return json_dumps(json, indent=2, ensure_ascii=False)
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
@@ -35,6 +35,25 @@
 
 SUBMIT_RETURN = {'run_id': 5}
 SUBMIT_JSON = '{"name": "test_run"}'
+# Canned get_run responses, fed to the mocked RunsApi.get_run in the --wait tests below.
+# Terminal response: the run finished and succeeded.
+RUNS_GET_RETURN_SUCCESS = {
+    "state": {
+        "life_cycle_state": "TERMINATED",
+        "result_state": "SUCCESS",
+    },
+}
+# Non-terminal response: the run has not started yet (no result_state present).
+RUNS_GET_RETURN_PENDING = {
+    "state": {
+        "life_cycle_state": "PENDING",
+        "state_message": "Waiting for cluster",
+    },
+    "run_page_url": "https://www.google.com",
+}
+# Non-terminal response: the run is in progress (no result_state or state_message present).
+RUNS_GET_RETURN_RUNNING = {
+    "state": {
+        "life_cycle_state": "RUNNING",
+    },
+    "run_page_url": "https://www.google.com",
+}
 
 
 @pytest.fixture()
@@ -56,6 +75,44 @@ def test_submit_cli_json(runs_api_mock):
         assert echo_mock.call_args[0][0] == pretty_format(SUBMIT_RETURN)
 
 
+@provide_conf
+def test_submit_wait_success(runs_api_mock):
+    """
+    With --wait, the command polls get_run until the run succeeds and then exits with 0.
+    """
+    polled_runs = [RUNS_GET_RETURN_PENDING, RUNS_GET_RETURN_PENDING,
+                   RUNS_GET_RETURN_RUNNING, RUNS_GET_RETURN_SUCCESS]
+    last_progress_message = ('Waiting on run to complete. Current state: '
+                             'RUNNING. URL: https://www.google.com')
+    runs_api_mock.submit_run.return_value = SUBMIT_RETURN
+    runs_api_mock.get_run.side_effect = polled_runs
+    with mock.patch('databricks_cli.runs.cli.click.echo') as echo_mock, \
+         mock.patch('time.sleep') as sleep_mock:
+        result = CliRunner().invoke(cli.submit_cli, ['--json', SUBMIT_JSON, '--wait'])
+    # One get_run call per canned response; no sleep follows the final, terminal poll.
+    assert runs_api_mock.get_run.call_count == 4
+    assert sleep_mock.call_count == 3
+    # The last message echoed is the progress line for the RUNNING state.
+    assert echo_mock.call_args[0][0] == last_progress_message
+    assert result.exit_code == 0
+
+
+@pytest.mark.parametrize('bad_life_cycle_state', ['TERMINATED', 'SKIPPED', 'INTERNAL_ERROR'])
+@provide_conf
+def test_submit_wait_failure(runs_api_mock, bad_life_cycle_state):
+    """
+    With --wait, a run that completes with a FAILED result reports the failure and exits 1.
+
+    Runs once for every life cycle state the command treats as completed.
+    """
+    failed_run = {
+        "state": {
+            "life_cycle_state": bad_life_cycle_state,
+            "result_state": "FAILED",
+            "state_message": "OH NO!",
+        },
+    }
+    polled_runs = [RUNS_GET_RETURN_PENDING, RUNS_GET_RETURN_PENDING,
+                   RUNS_GET_RETURN_RUNNING, failed_run]
+    expected_error = 'Run failed with state FAILED and state message OH NO!'
+    runs_api_mock.submit_run.return_value = SUBMIT_RETURN
+    runs_api_mock.get_run.side_effect = polled_runs
+    with mock.patch('click.echo') as echo_mock, mock.patch('time.sleep') as sleep_mock:
+        result = CliRunner().invoke(cli.submit_cli, ['--json', SUBMIT_JSON, '--wait'])
+    # One get_run call per canned response; no sleep follows the final, terminal poll.
+    assert runs_api_mock.get_run.call_count == 4
+    assert sleep_mock.call_count == 3
+    # The last thing echoed is the failure message.
+    assert expected_error in echo_mock.call_args[0][0]
+    assert result.exit_code == 1
+
+
 RUN_PAGE_URL = 'https://databricks.com/#job/1/run/1'
 LIST_RETURN = {
     'runs': [{