fix: Python models over command execution api timeout issue (#1243)

benc-db · web-flow · commit 4f347de9ceeb · 2025-10-30T10:26:00.000-07:00
### Description

The command execution API call was timing out automatically after 20
minutes because of the default timeout applied when waiting on the SDK's
`Wait` object. My intent was to grab the command id as soon as it was
available and then poll, so that we can cancel the command if the user
kills dbt. Switching back to that approach.

### Checklist

- [x] I have run this code in development and it appears to resolve the
stated issue
- [x] This PR includes tests, or tests are not required/relevant for
this PR
- [ ] I have updated the `CHANGELOG.md` and added information about my
change to the "dbt-databricks next" section.
diff --git a/dbt/adapters/databricks/api_client.py b/dbt/adapters/databricks/api_client.py
@@ -1,4 +1,5 @@
 import base64
+import random
 import re
 import time
 from abc import ABC, abstractmethod
@@ -140,7 +141,10 @@ def _handle_start_cluster_error(self, cluster_id: str, error: Exception) -> None
 
 class CommandContextApi:
     def __init__(
-        self, workspace_client: WorkspaceClient, cluster_api: ClusterApi, library_api: LibraryApi
+        self,
+        workspace_client: WorkspaceClient,
+        cluster_api: ClusterApi,
+        library_api: LibraryApi,
     ):
         self.workspace_client = workspace_client
         self.cluster_api = cluster_api
@@ -162,21 +166,92 @@ def _ensure_cluster_ready(self, cluster_id: str) -> None:
         ):
             self.cluster_api.wait_for_cluster(cluster_id)
 
-    def _create_execution_context(self, cluster_id: str) -> str:
-        try:
-            result = self.workspace_client.command_execution.create(
-                cluster_id=cluster_id,
-                language=ComputeLanguage.PYTHON,
-            )
+    def _create_execution_context(self, cluster_id: str, max_retries: int = 5) -> str:
+        """Create execution context with retry logic for transient failures.
 
-            context_response = result.result()
-            context_id = context_response.id
-            if context_id is None:
-                raise DbtRuntimeError("Failed to create execution context: no context ID returned")
-            logger.info(f"Created execution context with id={context_id}")
-            return context_id
-        except Exception as e:
-            raise DbtRuntimeError(f"Error creating an execution context.\n {e}")
+        Args:
+            cluster_id: The cluster ID to create the context on
+            max_retries: Maximum number of creation attempts in total (default: 5)
+
+        Returns:
+            The execution context ID
+
+        Raises:
+            DbtRuntimeError: On a non-retryable error, or once every attempt has failed
+        """
+        last_error = None
+        for attempt in range(max_retries):
+            context_id = None
+            try:
+                # Use SDK to create execution context - returns a Wait object
+                # The Wait object provides context_id immediately, but we need to call result()
+                # to wait for the context to reach RUNNING state
+                waiter = self.workspace_client.command_execution.create(
+                    cluster_id=cluster_id,
+                    language=ComputeLanguage.PYTHON,
+                )
+
+                # Get context_id immediately (available before waiting)
+                context_id = waiter.context_id
+                if context_id is None:
+                    raise DbtRuntimeError(
+                        "Failed to create execution context: no context ID returned"
+                    )
+
+                logger.debug(f"Execution context {context_id} created, waiting for RUNNING state")
+
+                # Now wait for the context to reach RUNNING state
+                # This is where it may fail with ContextStatus.ERROR
+                waiter.result()
+
+                logger.info(f"Execution context {context_id} reached RUNNING state")
+                return context_id
+            except Exception as e:
+                last_error = e
+                error_msg = str(e).lower()
+
+                # Log full exception details for debugging
+                logger.debug(
+                    f"Execution context {context_id or 'unknown'} creation exception: "
+                    f"type={type(e).__name__}, message={e}"
+                )
+
+                # Retry only when the lowercased message matches a known transient failure
+                # ContextStatus.ERROR can occur when cluster is under heavy load
+                if "contextstatus.error" in error_msg or "failed to reach running" in error_msg:
+                    if attempt < max_retries - 1:
+                        # If we have a context_id, try to destroy it before retrying
+                        if context_id:
+                            try:
+                                logger.debug(f"Destroying failed context {context_id}")
+                                self.workspace_client.command_execution.destroy(
+                                    cluster_id=cluster_id, context_id=context_id
+                                )
+                            except Exception as cleanup_error:
+                                logger.debug(
+                                    f"Failed to destroy context {context_id}: {cleanup_error}"
+                                )
+
+                        # Exponential backoff with jitter: base 2^attempt + random 0-1s
+                        # This helps prevent thundering herd when many contexts retry at once
+                        base_wait = 2**attempt  # 1s, 2s, 4s, 8s (no sleep after last attempt)
+                        jitter = random.random()  # 0-1 second
+                        wait_time = base_wait + jitter
+                        logger.warning(
+                            f"Execution context creation failed "
+                            f"(attempt {attempt + 1}/{max_retries}), "
+                            f"retrying in {wait_time:.1f}s: {e}"
+                        )
+                        time.sleep(wait_time)
+                        continue
+
+                # Non-retryable error or final attempt - raise immediately
+                raise DbtRuntimeError(f"Error creating an execution context.\n {e}")
+
+        # Only reached when the loop body never runs (max_retries <= 0)
+        raise DbtRuntimeError(
+            f"Error creating an execution context after {max_retries} attempts.\n {last_error}"
+        )
 
     def destroy(self, cluster_id: str, context_id: str) -> None:
         try:
@@ -300,15 +375,20 @@ def __init__(self, workspace_client: WorkspaceClient, polling_interval: int, tim
 
     def execute(self, cluster_id: str, context_id: str, command: str) -> CommandExecution:
         try:
-            # Use SDK to execute command
-            result = self.workspace_client.command_execution.execute(
+            # Use SDK to execute command - returns a Wait object immediately
+            # The command_id is available via __getattr__ without calling result()
+            # We don't call result() because that would wait for execution to finish,
+            # and we want to use our own timeout via poll_for_completion()
+            waiter = self.workspace_client.command_execution.execute(
                 cluster_id=cluster_id,
                 context_id=context_id,
                 language=ComputeLanguage.PYTHON,  # SUBMISSION_LANGUAGE was "python"
                 command=command,
             )
 
-            command_id = result.result().id
+            # Extract command_id from the waiter without blocking
+            # The SDK provides this immediately in the kwargs
+            command_id = waiter.command_id
             if command_id is None:
                 raise DbtRuntimeError("Failed to execute command: no command ID returned")
             logger.debug(f"Command executed with id={command_id}")
diff --git a/pyproject.toml b/pyproject.toml
@@ -88,10 +88,10 @@ python = "3.10"
 [tool.hatch.envs.default.scripts]
 setup-precommit = "pre-commit install"
 code-quality = "pre-commit run --all-files"
-unit = "pytest --color=yes -v --profile databricks_cluster -n auto tests/unit"
-cluster-e2e = "pytest --color=yes -v --profile databricks_cluster -n auto --dist=loadfile tests/functional"
-uc-cluster-e2e = "pytest --color=yes -v --profile databricks_uc_cluster -n auto --dist=loadfile tests/functional"
-sqlw-e2e = "pytest --color=yes -v --profile databricks_uc_sql_endpoint -n auto --dist=loadfile tests/functional"
+unit = "pytest --color=yes -v --profile databricks_cluster -n 10 tests/unit"
+cluster-e2e = "pytest --color=yes -v --profile databricks_cluster -n 10 --dist=loadfile tests/functional"
+uc-cluster-e2e = "pytest --color=yes -v --profile databricks_uc_cluster -n 10 --dist=loadfile tests/functional"
+sqlw-e2e = "pytest --color=yes -v --profile databricks_uc_sql_endpoint -n 10 --dist=loadfile tests/functional"
 
 [tool.hatch.envs.test.scripts]
 unit = "pytest --color=yes -v --profile databricks_cluster -n 10 --dist=loadscope tests/unit"
diff --git a/tests/functional/adapter/python_model/fixtures.py b/tests/functional/adapter/python_model/fixtures.py
@@ -314,3 +314,26 @@ def model(dbt, spark):
     data = [[1,2]] * 10
     return spark.createDataFrame(data, schema=['test', 'test2'])
 """
+
+all_purpose_command_api_schema = """version: 2
+
+models:
+  - name: my_versioned_sql_model
+    versions:
+      - v: 1
+  - name: my_python_model
+    # No submission_method or create_notebook config here
+    # Will use project-level config (all_purpose_cluster with create_notebook=False)
+
+sources:
+  - name: test_source
+    loader: custom
+    schema: "{{ var(env_var('DBT_TEST_SCHEMA_NAME_VARIABLE')) }}"
+    quoting:
+      identifier: True
+    tags:
+      - my_test_source_tag
+    tables:
+      - name: test_table
+        identifier: source
+"""
diff --git a/tests/functional/adapter/python_model/test_python_model.py b/tests/functional/adapter/python_model/test_python_model.py
@@ -458,3 +458,32 @@ def test_changing_unique_tmp_table_suffix(self, project):
         )
         util.run_dbt(["run"])
         verify_temp_tables_cleaned(project)
+
+
+@pytest.mark.python
+@pytest.mark.skip_profile("databricks_uc_sql_endpoint")
+class TestAllPurposeClusterCommandAPI(BasePythonModelTests):
+    """Test Python models using all_purpose_cluster with Command API (create_notebook=False).
+
+    Inherits its checks from BasePythonModelTests and uses project-level config to select
+    the Command API path (no notebook is created), exercising the command execution timeout fix.
+    """
+
+    @pytest.fixture(scope="class")
+    def models(self):
+        return {
+            "schema.yml": override_fixtures.all_purpose_command_api_schema,
+            "my_sql_model.sql": fixtures.basic_sql,
+            "my_versioned_sql_model_v1.sql": fixtures.basic_sql,
+            "my_python_model.py": fixtures.basic_python,
+            "second_sql_model.sql": fixtures.second_sql,
+        }
+
+    @pytest.fixture(scope="class")
+    def project_config_update(self):
+        return {
+            "models": {
+                "+submission_method": "all_purpose_cluster",
+                "+create_notebook": False,  # Use Command API, not notebook submission
+            }
+        }
diff --git a/tests/unit/api_client/test_command_api.py b/tests/unit/api_client/test_command_api.py
@@ -33,9 +33,11 @@ def test_execute__exception(self, api, workspace_client):
         assert "Error creating a command" in str(exc_info.value)
 
     def test_execute__success(self, api, workspace_client, execution):
-        mock_result = Mock()
-        mock_result.result.return_value.id = "command_id"
-        workspace_client.command_execution.execute.return_value = mock_result
+        # Mock the Wait object returned by execute()
+        # The command_id is available immediately via __getattr__, not via result()
+        mock_waiter = Mock()
+        mock_waiter.command_id = "command_id"
+        workspace_client.command_execution.execute.return_value = mock_waiter
 
         result = api.execute("cluster_id", "context_id", "command")
 
@@ -46,6 +48,8 @@ def test_execute__success(self, api, workspace_client, execution):
             command="command",
             language=ComputeLanguage.PYTHON,
         )
+        # result() should NOT be called - we access command_id directly
+        mock_waiter.result.assert_not_called()
 
     def test_cancel__exception(self, api, workspace_client):
         workspace_client.command_execution.cancel.side_effect = Exception("API Error")
diff --git a/tests/unit/api_client/test_command_context_api.py b/tests/unit/api_client/test_command_context_api.py
@@ -1,7 +1,6 @@
 from unittest.mock import Mock
 
 import pytest
-from databricks.sdk.service.compute import ContextStatusResponse
 from dbt_common.exceptions import DbtRuntimeError
 
 from dbt.adapters.databricks.api_client import CommandContextApi
@@ -38,11 +37,14 @@ def test_create__cluster_running(self, api, cluster_api, library_api, workspace_
         cluster_api.status.return_value = "RUNNING"
         library_api.all_libraries_installed.return_value = True
 
-        mock_result = Mock()
-        mock_context_response = Mock(spec=ContextStatusResponse)
-        mock_context_response.id = "context_id"
-        mock_result.result.return_value = mock_context_response
-        workspace_client.command_execution.create.return_value = mock_result
+        # Mock the Wait object returned by create()
+        # The Wait object has context_id immediately available and result() waits for RUNNING
+        mock_waiter = Mock()
+        mock_waiter.context_id = "context_id"
+        mock_response = Mock()
+        mock_response.id = "context_id"
+        mock_waiter.result.return_value = mock_response
+        workspace_client.command_execution.create.return_value = mock_waiter
 
         context_id = api.create("cluster_id")
 
@@ -56,11 +58,14 @@ def test_create__cluster_running_with_pending_libraries(
         cluster_api.status.return_value = "RUNNING"
         library_api.all_libraries_installed.return_value = False
 
-        mock_result = Mock()
-        mock_context_response = Mock(spec=ContextStatusResponse)
-        mock_context_response.id = "context_id"
-        mock_result.result.return_value = mock_context_response
-        workspace_client.command_execution.create.return_value = mock_result
+        # Mock the Wait object returned by create()
+        # The Wait object has context_id immediately available and result() waits for RUNNING
+        mock_waiter = Mock()
+        mock_waiter.context_id = "context_id"
+        mock_response = Mock()
+        mock_response.id = "context_id"
+        mock_waiter.result.return_value = mock_response
+        workspace_client.command_execution.create.return_value = mock_waiter
 
         context_id = api.create("cluster_id")
 
@@ -71,11 +76,14 @@ def test_create__cluster_running_with_pending_libraries(
     def test_create__cluster_terminated(self, api, cluster_api, workspace_client):
         cluster_api.status.return_value = "TERMINATED"
 
-        mock_result = Mock()
-        mock_context_response = Mock(spec=ContextStatusResponse)
-        mock_context_response.id = "context_id"
-        mock_result.result.return_value = mock_context_response
-        workspace_client.command_execution.create.return_value = mock_result
+        # Mock the Wait object returned by create()
+        # The Wait object has context_id immediately available and result() waits for RUNNING
+        mock_waiter = Mock()
+        mock_waiter.context_id = "context_id"
+        mock_response = Mock()
+        mock_response.id = "context_id"
+        mock_waiter.result.return_value = mock_response
+        workspace_client.command_execution.create.return_value = mock_waiter
 
         api.create("cluster_id")
 
@@ -84,11 +92,14 @@ def test_create__cluster_terminated(self, api, cluster_api, workspace_client):
     def test_create__cluster_pending(self, api, cluster_api, workspace_client):
         cluster_api.status.return_value = "PENDING"
 
-        mock_result = Mock()
-        mock_context_response = Mock(spec=ContextStatusResponse)
-        mock_context_response.id = "context_id"
-        mock_result.result.return_value = mock_context_response
-        workspace_client.command_execution.create.return_value = mock_result
+        # Mock the Wait object returned by create()
+        # The Wait object has context_id immediately available and result() waits for RUNNING
+        mock_waiter = Mock()
+        mock_waiter.context_id = "context_id"
+        mock_response = Mock()
+        mock_response.id = "context_id"
+        mock_waiter.result.return_value = mock_response
+        workspace_client.command_execution.create.return_value = mock_waiter
 
         api.create("cluster_id")