
Commit 05a0b8d

Merge branch 'main' into 1.10.latest
2 parents: 5f686f2 + 64c6347


26 files changed: +424 -112 lines


.github/workflows/integration.yml

Lines changed: 57 additions & 1 deletion
@@ -1,6 +1,20 @@
+# Integration Tests for dbt-databricks
+#
+# This workflow runs integration tests that require Databricks secrets.
+#
+# For testing external contributions (PRs from forks):
+# 1. Go to Actions tab -> Integration Tests -> Run workflow
+# 2. Enter the PR number in the 'pr_number' field
+# 3. Click "Run workflow"
+#
+# This approach is secure because:
+# - The workflow runs in the databricks repository context (access to secrets)
+# - The code to test is explicitly specified by maintainers
+# - No automatic execution of untrusted code with secrets
 name: Integration Tests
 on:
-  push:
+  pull_request:
+    # Run on PRs to the same repository (internal contributors)
     paths-ignore:
       - "**.MD"
       - "**.md"
@@ -9,6 +23,18 @@ on:
       - ".github/workflows/main.yml"
       - ".github/workflows/stale.yml"

+  workflow_dispatch:
+    # Manual triggering for external contributions and ad-hoc testing
+    inputs:
+      pr_number:
+        description: "PR number to test (for external contributions)"
+        required: false
+        type: string
+      git_ref:
+        description: "Git ref (branch/tag/commit) to test"
+        required: false
+        type: string
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
@@ -17,6 +43,8 @@ jobs:
   run-uc-cluster-e2e-tests:
     runs-on: ubuntu-latest
     environment: azure-prod
+    # Only run on internal PRs or manual dispatch - skip external forks to avoid secret access failures
+    if: github.event_name == 'workflow_dispatch' || github.event.pull_request.head.repo.full_name == github.repository
     env:
       DBT_DATABRICKS_HOST_NAME: ${{ secrets.DATABRICKS_HOST }}
       DBT_DATABRICKS_CLIENT_ID: ${{ secrets.TEST_PECO_SP_ID }}
@@ -27,6 +55,14 @@ jobs:
     steps:
       - name: Check out repository
         uses: actions/checkout@v4
+        with:
+          # For pull_request: checkout the PR head commit
+          # For workflow_dispatch with pr_number: checkout that PR's head
+          # For workflow_dispatch with git_ref: checkout that ref
+          # Otherwise: checkout current branch
+          ref: ${{ github.event.pull_request.head.sha || (github.event.inputs.pr_number && format('refs/pull/{0}/head', github.event.inputs.pr_number)) || github.event.inputs.git_ref || github.ref }}
+          # Fetch enough history for PR testing
+          fetch-depth: 0

       - name: Set up python
         id: setup-python
@@ -56,6 +92,8 @@ jobs:
   run-sqlwarehouse-e2e-tests:
     runs-on: ubuntu-latest
     environment: azure-prod
+    # Only run on internal PRs or manual dispatch - skip external forks to avoid secret access failures
+    if: github.event_name == 'workflow_dispatch' || github.event.pull_request.head.repo.full_name == github.repository
     env:
       DBT_DATABRICKS_HOST_NAME: ${{ secrets.DATABRICKS_HOST }}
       DBT_DATABRICKS_CLIENT_ID: ${{ secrets.TEST_PECO_SP_ID }}
@@ -67,6 +105,14 @@ jobs:
     steps:
       - name: Check out repository
         uses: actions/checkout@v4
+        with:
+          # For pull_request: checkout the PR head commit
+          # For workflow_dispatch with pr_number: checkout that PR's head
+          # For workflow_dispatch with git_ref: checkout that ref
+          # Otherwise: checkout current branch
+          ref: ${{ github.event.pull_request.head.sha || (github.event.inputs.pr_number && format('refs/pull/{0}/head', github.event.inputs.pr_number)) || github.event.inputs.git_ref || github.ref }}
+          # Fetch enough history for PR testing
+          fetch-depth: 0

       - name: Set up python
         id: setup-python
@@ -96,6 +142,8 @@ jobs:
   run-cluster-e2e-tests:
     runs-on: ubuntu-latest
    environment: azure-prod
+    # Only run on internal PRs or manual dispatch - skip external forks to avoid secret access failures
+    if: github.event_name == 'workflow_dispatch' || github.event.pull_request.head.repo.full_name == github.repository
     env:
       DBT_DATABRICKS_HOST_NAME: ${{ secrets.DATABRICKS_HOST }}
       DBT_DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
@@ -104,6 +152,14 @@ jobs:
     steps:
       - name: Check out repository
         uses: actions/checkout@v4
+        with:
+          # For pull_request: checkout the PR head commit
+          # For workflow_dispatch with pr_number: checkout that PR's head
+          # For workflow_dispatch with git_ref: checkout that ref
+          # Otherwise: checkout current branch
+          ref: ${{ github.event.pull_request.head.sha || (github.event.inputs.pr_number && format('refs/pull/{0}/head', github.event.inputs.pr_number)) || github.event.inputs.git_ref || github.ref }}
+          # Fetch enough history for PR testing
+          fetch-depth: 0

       - name: Set up python
         id: setup-python
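The checkout `ref` expression above chains fallbacks with `||`, resolving to the first non-empty value. A small Python sketch of the same precedence (the function and parameter names are illustrative only, not part of the workflow):

```python
def resolve_checkout_ref(pr_head_sha, pr_number, git_ref, current_ref):
    """Illustrative only: mirrors the precedence of the Actions expression."""
    if pr_head_sha:  # pull_request event: test the PR head commit
        return pr_head_sha
    if pr_number:  # workflow_dispatch with a PR number supplied by a maintainer
        return f"refs/pull/{pr_number}/head"
    if git_ref:  # workflow_dispatch with an explicit branch/tag/commit
        return git_ref
    return current_ref  # otherwise, the current branch


# e.g. a manual run against a hypothetical PR number:
print(resolve_checkout_ref(None, "1234", None, "refs/heads/main"))
# -> refs/pull/1234/head
```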

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -21,3 +21,5 @@ logs/
 .python-version
 .hatch
 .coverage*
+CLAUDE.md
+.claude/

CHANGELOG.md

Lines changed: 20 additions & 1 deletion
@@ -1,4 +1,23 @@
-## dbt-databricks 1.10.11 (TBD)
+## dbt-databricks 1.10.13 (TBD)
+
+## dbt-databricks 1.10.12 (September 8, 2025)
+
+### Under the hood
+
+- Update dependency versions ([1178](https://github.com/databricks/dbt-databricks/pull/1178))
+
+## dbt-databricks 1.10.11 (September 2, 2025)
+
+### Fixes
+
+- Improve ANSI mode error handling for Python models and add debug instrumentation ([1157](https://github.com/databricks/dbt-databricks/pull/1157))
+- Remove external path on intermediate tables for incremental models (with Materialization V2) ([1161](https://github.com/databricks/dbt-databricks/pull/1161))
+- Fix get_columns_in_relation branching logic for streaming tables to prevent it from running `AS JSON`
+- Fix model-level compute override connection logic that was causing invalid spark configs to be set on SQL warehouses
+
+### Under the hood
+
+- Improve performance of schema enumeration/validation ([1168](https://github.com/databricks/dbt-databricks/pull/1168))

 ## dbt-databricks 1.10.10 (August 20, 2025)

README.md

Lines changed: 17 additions & 0 deletions
@@ -99,3 +99,20 @@ def model(dbt, session):
         http_path="sql/protocolv1/..."
     )
 ```
+
+## Python models and ANSI mode
+
+When ANSI mode is enabled (`spark.sql.ansi.enabled=true`), there are limitations when using pandas DataFrames in Python models:
+
+1. **Regular pandas DataFrames**: dbt-databricks will automatically handle conversion even when ANSI mode is enabled, falling back to `spark.createDataFrame()` if needed.
+
+2. **pandas-on-Spark DataFrames**: If you create pandas-on-Spark DataFrames directly in your model (using `pyspark.pandas` or `databricks.koalas`), you may encounter errors with ANSI mode enabled. In this case, you have two options:
+   - Disable ANSI mode for your session: set `spark.sql.ansi.enabled=false` in your cluster or SQL warehouse configuration
+   - Set the pandas-on-Spark option in your model code:
+     ```python
+     import pyspark.pandas as ps
+     ps.set_option('compute.fail_on_ansi_mode', False)
+     ```
+   Note: this may cause unexpected behavior, as pandas-on-Spark follows pandas semantics (returning null/NaN for invalid operations) rather than ANSI SQL semantics (raising errors).
+
+For more information about ANSI mode and its implications, see the [Spark documentation on ANSI compliance](https://spark.apache.org/docs/latest/sql-ref-ansi-compliance.html).
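As an aside on the first point above, here is a minimal Python model sketch (column names and values are made up for illustration) that returns a plain pandas DataFrame and relies on the adapter's conversion behavior described in the README addition:

```python
import pandas as pd


def model(dbt, session):
    # Hypothetical example model; the config and data are illustrative only.
    dbt.config(materialized="table")

    df = pd.DataFrame({"id": [1, 2, 3], "amount": [10.5, None, 7.25]})

    # Returning a plain pandas DataFrame works even with ANSI mode enabled:
    # dbt-databricks handles the conversion, falling back to
    # spark.createDataFrame() if the pandas-on-Spark path would fail.
    return df
```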
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-version = "1.10.10"
+version = "1.10.12"

dbt/adapters/databricks/api_client.py

Lines changed: 42 additions & 5 deletions
@@ -452,31 +452,68 @@ def _get_exception(self, response: Response) -> None:
         result_state = state.get("result_state")
         life_cycle_state = state["life_cycle_state"]

+        # Add detailed logging for debugging
+        logger.debug(f"[Python Model Debug] Full response state: {state}")
+        logger.debug(f"[Python Model Debug] Life cycle state: {life_cycle_state}")
+        logger.debug(f"[Python Model Debug] Result state: {result_state}")
+
         if result_state == "CANCELED":
             raise DbtRuntimeError(f"Python model run ended in result_state {result_state}")

         if life_cycle_state != "TERMINATED":
             try:
+                # Log task information for debugging
+                tasks = response_json.get("tasks", [])
+                logger.debug(f"[Python Model Debug] Tasks in response: {len(tasks)}")
+                for i, task in enumerate(tasks):
+                    logger.debug(f"[Python Model Debug] Task {i}: {task}")
+
                 task_id = response_json["tasks"][0]["run_id"]
+                logger.debug(f"[Python Model Debug] Getting output for task_id: {task_id}")
+
                 # get end state to return to user
                 run_output = self.session.get("/get-output", params={"run_id": task_id})
                 json_run_output = run_output.json()
+
+                # Log the full output for debugging
+                logger.debug(f"[Python Model Debug] Run output status: {run_output.status_code}")
+                logger.debug(
+                    f"[Python Model Debug] Run output keys: {list(json_run_output.keys())}"
+                )
+
+                # Extract more detailed error information
+                error_msg = json_run_output.get("error", "No error message available")
+                error_trace = utils.remove_ansi(json_run_output.get("error_trace", ""))
+
+                # Check for specific Python model issues
+                if "error_trace" in json_run_output:
+                    logger.debug(f"[Python Model Debug] Error trace found: {error_trace[:500]}...")
+
+                # Include run ID and task information in error
+                run_id = response_json.get("run_id")
                 raise DbtRuntimeError(
-                    "Python model failed with traceback as:\n"
+                    f"Python model failed (run_id: {run_id}, task_id: {task_id})\n"
+                    "Traceback:\n"
                     "(Note that the line number here does not "
                     "match the line number in your code due to dbt templating)\n"
-                    f"{json_run_output['error']}\n"
-                    f"{utils.remove_ansi(json_run_output.get('error_trace', ''))}"
+                    f"{error_msg}\n"
+                    f"{error_trace}"
                 )

             except Exception as e:
                 if isinstance(e, DbtRuntimeError):
                     raise e
                 else:
+                    # Log the exception for debugging
+                    logger.debug(f"[Python Model Debug] Exception during error extraction: {e}")
                     state_message = response.json()["state"]["state_message"]
+
+                    # Include more context in error
                     raise DbtRuntimeError(
-                        f"Python model run ended in state {life_cycle_state}"
-                        f"with state_message\n{state_message}"
+                        f"Python model run ended in state {life_cycle_state} "
+                        f"(run_id: {response_json.get('run_id')})\n"
+                        f"State message: {state_message}\n"
+                        f"Result state: {result_state}"
                     )

     def cancel(self, run_id: str) -> None:

dbt/adapters/databricks/catalogs/_unity.py

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ def build_relation(self, model: RelationConfig) -> DatabricksCatalogRelation:
         Args:
             model: `config.model` (not `model`) from the jinja context
         """
+
         return DatabricksCatalogRelation(
             catalog_type=self.catalog_type,
             catalog_name=self.catalog_name

dbt/adapters/databricks/connections.py

Lines changed: 5 additions & 8 deletions
@@ -36,7 +36,7 @@
 from dbt.adapters.databricks.handle import CursorWrapper, DatabricksHandle, SqlUtils
 from dbt.adapters.databricks.logging import logger
 from dbt.adapters.databricks.python_models.run_tracking import PythonRunTracker
-from dbt.adapters.databricks.utils import redact_credentials
+from dbt.adapters.databricks.utils import is_cluster_http_path, redact_credentials
 from dbt.adapters.events.types import (
     ConnectionClosedInCleanup,
     ConnectionReused,
@@ -130,12 +130,8 @@ def api_client(self) -> DatabricksApiClient:

     def is_cluster(self) -> bool:
         conn = self.get_thread_connection()
-        return (
-            conn.credentials.cluster_id is not None
-            # Credentials field is not updated when overriding the compute at model level.
-            # This secondary check is a workaround for that case
-            or "/warehouses/" not in cast(DatabricksDBTConnection, conn).http_path
-        )
+        databricks_conn = cast(DatabricksDBTConnection, conn)
+        return is_cluster_http_path(databricks_conn.http_path, conn.credentials.cluster_id)

     def cancel_open(self) -> list[str]:
         cancelled = super().cancel_open()
@@ -402,7 +398,8 @@ def connect() -> DatabricksHandle:
             try:
                 # TODO: what is the error when a user specifies a catalog they don't have access to
                 conn = DatabricksHandle.from_connection_args(
-                    conn_args, creds.cluster_id is not None
+                    conn_args,
+                    is_cluster_http_path(databricks_connection.http_path, creds.cluster_id),
                 )
                 if conn:
                     databricks_connection.session_id = conn.session_id
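The new `is_cluster_http_path` helper is not shown in this diff. Based on the inline check it replaces and the changelog entry about model-level compute overrides, it plausibly gives the shape of `http_path` priority over `cluster_id`. A rough sketch under that assumption (not the actual implementation in `dbt/adapters/databricks/utils.py`):

```python
from typing import Optional


def is_cluster_http_path(http_path: Optional[str], cluster_id: Optional[str]) -> bool:
    """Sketch only: the real helper in utils.py may differ in detail."""
    path = http_path or ""
    # A SQL warehouse path is never a cluster, even if a cluster_id is also
    # configured (e.g. when compute is overridden to a warehouse at the model level).
    if "/warehouses/" in path:
        return False
    # Otherwise fall back to the credential hint or a cluster-style path.
    return cluster_id is not None or "/sql/protocolv1/" in path
```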

dbt/adapters/databricks/impl.py

Lines changed: 7 additions & 15 deletions
@@ -300,21 +300,14 @@ def compare_dbr_version(self, major: int, minor: int) -> int:
         return self.connections.compare_dbr_version(major, minor)

     def list_schemas(self, database: Optional[str]) -> list[str]:
-        """
-        Get a list of existing schemas in database.
-
-        If `database` is `None`, fallback to executing `show databases` because
-        `list_schemas` tries to collect schemas from all catalogs when `database` is `None`.
-        """
-        if database is not None:
-            results = self.connections.list_schemas(database=database)
-        else:
-            results = self.execute_macro(LIST_SCHEMAS_MACRO_NAME, kwargs={"database": database})
+        results = self.execute_macro(LIST_SCHEMAS_MACRO_NAME, kwargs={"database": database})
         return [row[0] for row in results]

     def check_schema_exists(self, database: Optional[str], schema: str) -> bool:
         """Check if a schema exists."""
-        return schema.lower() in set(s.lower() for s in self.list_schemas(database=database))
+        return schema.lower() in set(
+            s.lower() for s in self.connections.list_schemas(database or "hive_metastore", schema)
+        )

     def execute(
         self,
@@ -485,10 +478,9 @@ def get_columns_in_relation(  # type: ignore[override]
             relation.is_hive_metastore()
             or self.compare_dbr_version(16, 2) < 0
             or relation.type == DatabricksRelationType.MaterializedView
-            or (
-                relation.type == DatabricksRelationType.StreamingTable
-                and self.compare_dbr_version(17, 1) < 0
-            )
+            # TODO: Replace with self.compare_dbr_version(17, 1) < 0 when 17.1 is current version
+            # for SQL warehouses
+            or relation.type == DatabricksRelationType.StreamingTable
         )
         return self.get_column_behavior.get_columns_in_relation(self, relation, use_legacy_logic)

dbt/adapters/databricks/parse_model.py

Lines changed: 2 additions & 1 deletion
@@ -40,7 +40,8 @@ def table_format(model: RelationConfig) -> Optional[str]:
 def _get(
     model: RelationConfig, setting: str, case_sensitive: Optional[bool] = False
 ) -> Optional[str]:
-    if not model.config:
+    # dbt-core can sometimes pass in non-model configs that don't have "get" defined
+    if not model.config or not hasattr(model.config, "get"):
         return None

     if value := model.config.get(setting):
