Skip to content
This repository was archived by the owner on Sep 2, 2025. It is now read-only.

Commit 8225313

Browse files
authored
Add configuration options for enable_list_inference and intermediate_format for python models (#1205)
* add configuration options for enable_list_inference and intermediate_format for python models
* update test classes so that they run as part of the python model tests in tox
* reconfigure tests to force serial run instead of parallel
1 parent 32fe415 commit 8225313

File tree

7 files changed

+184
-0
lines changed

7 files changed

+184
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
kind: Features
2+
body: Add configuration options `enable_list_inference` and `intermediate_format` for python
3+
models
4+
time: 2024-04-26T10:53:19.874239-04:00
5+
custom:
6+
Author: mikealfare
7+
Issue: 1047 1114
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
kind: Fixes
2+
body: Default `enableListInference` to `True` for python models to support nested
3+
lists
4+
time: 2024-04-26T10:52:24.827314-04:00
5+
custom:
6+
Author: mikealfare
7+
Issue: 1047 1114

dbt/adapters/bigquery/impl.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ class BigqueryConfig(AdapterConfig):
9999
enable_refresh: Optional[bool] = None
100100
refresh_interval_minutes: Optional[int] = None
101101
max_staleness: Optional[str] = None
102+
enable_list_inference: Optional[bool] = None
103+
intermediate_format: Optional[str] = None
102104

103105

104106
class BigQueryAdapter(BaseAdapter):

dbt/include/bigquery/macros/materializations/table.sql

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,19 @@
4949
from pyspark.sql import SparkSession
5050
{%- set raw_partition_by = config.get('partition_by', none) -%}
5151
{%- set raw_cluster_by = config.get('cluster_by', none) -%}
52+
{%- set enable_list_inference = config.get('enable_list_inference', true) -%}
53+
{%- set intermediate_format = config.get('intermediate_format', none) -%}
54+
5255
{%- set partition_config = adapter.parse_partition_by(raw_partition_by) %}
5356

5457
spark = SparkSession.builder.appName('smallTest').getOrCreate()
5558

5659
spark.conf.set("viewsEnabled","true")
5760
spark.conf.set("temporaryGcsBucket","{{target.gcs_bucket}}")
61+
spark.conf.set("enableListInference", "{{ enable_list_inference }}")
62+
{% if intermediate_format %}
63+
spark.conf.set("intermediateFormat", "{{ intermediate_format }}")
64+
{% endif %}
5865

5966
{{ compiled_code }}
6067
dbt = dbtObj(spark.read.format("bigquery").load)

tests/functional/python_model_tests/__init__.py

Whitespace-only changes.
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# Python model source: serverless submission, table materialization, and a
# single row whose `column_name` value is a dict that contains a list.
# Neither `enable_list_inference` nor `intermediate_format` is set here.
SINGLE_RECORD = """
import pandas as pd

def model(dbt, session):

    dbt.config(
        submission_method="serverless",
        materialized="table"
    )

    df = pd.DataFrame(
        [
            {"column_name": {"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}},
        ]
    )

    return df
"""
19+
20+
21+
# Python model source: same configuration as a plain serverless table model,
# but `column_name` holds a *list* of dicts, each of which contains a list.
# Neither `enable_list_inference` nor `intermediate_format` is set here.
MULTI_RECORD = """
import pandas as pd

def model(dbt, session):

    dbt.config(
        submission_method="serverless",
        materialized="table",
    )

    df = pd.DataFrame(
        [
            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
        ]
    )

    return df
"""
39+
40+
41+
# Python model source: nested list-of-dicts payload with
# `intermediate_format` set to "orc"; `enable_list_inference` is left unset.
ORC_FORMAT = """
import pandas as pd

def model(dbt, session):

    dbt.config(
        submission_method="serverless",
        materialized="table",
        intermediate_format="orc",
    )

    df = pd.DataFrame(
        [
            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
        ]
    )

    return df
"""
60+
61+
62+
# Python model source: nested list-of-dicts payload with
# `enable_list_inference` set explicitly to "true"; `intermediate_format`
# is left unset.
ENABLE_LIST_INFERENCE = """
import pandas as pd

def model(dbt, session):

    dbt.config(
        submission_method="serverless",
        materialized="table",
        enable_list_inference="true",
    )

    df = pd.DataFrame(
        [
            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
        ]
    )

    return df
"""
81+
82+
83+
# Python model source: nested list-of-dicts payload with both settings spelled
# out: `enable_list_inference="true"` and `intermediate_format="parquet"`.
ENABLE_LIST_INFERENCE_PARQUET_FORMAT = """
import pandas as pd

def model(dbt, session):

    dbt.config(
        submission_method="serverless",
        materialized="table",
        enable_list_inference="true",
        intermediate_format="parquet",
    )

    df = pd.DataFrame(
        [
            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
        ]
    )

    return df
"""
103+
104+
105+
# Python model source: nested list-of-dicts payload with
# `enable_list_inference="false"` combined with `intermediate_format="orc"`.
DISABLE_LIST_INFERENCE_ORC_FORMAT = """
import pandas as pd

def model(dbt, session):

    dbt.config(
        submission_method="serverless",
        materialized="table",
        enable_list_inference="false",
        intermediate_format="orc",
    )

    df = pd.DataFrame(
        [
            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
        ]
    )

    return df

"""
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
This test case addresses this regression: https://github.com/dbt-labs/dbt-bigquery/issues/1047
3+
4+
As the comments point out, the issue appears when the settings are:
5+
- list inference: off
6+
- intermediate format: parquet
7+
8+
Adjusting either of these alleviates the issue.
9+
10+
When the regression was first reported, `files.MULTI_RECORD` failed while the other models passed.
11+
"""
12+
from dbt.tests.util import run_dbt_and_capture
13+
import pytest
14+
15+
from tests.functional.python_model_tests import files
16+
17+
18+
class TestListInference:
    """Run all list-inference / intermediate-format python model variants together."""

    @pytest.fixture(scope="class")
    def models(self):
        """Return the mapping of model file name to python model source."""
        # `single_record` is what worked prior to this issue;
        # `multi_record` is the model that initially failed for this issue
        baseline_models = {
            "single_record.py": files.SINGLE_RECORD,
            "multi_record.py": files.MULTI_RECORD,
        }
        # these are explicit versions of the default settings
        explicit_default_models = {
            "enable_list_inference.py": files.ENABLE_LIST_INFERENCE,
            "enable_list_inference_parquet_format.py": files.ENABLE_LIST_INFERENCE_PARQUET_FORMAT,
        }
        # orc format also resolves the issue, regardless of list inference
        orc_models = {
            "orc_format.py": files.ORC_FORMAT,
            "disable_list_inference_orc_format.py": files.DISABLE_LIST_INFERENCE_ORC_FORMAT,
        }
        return {**baseline_models, **explicit_default_models, **orc_models}

    def test_models_success(self, project, models):
        """Check that `dbt run` yields one result per configured model."""
        # the captured output is not inspected; only the result count matters here
        results, _captured = run_dbt_and_capture(["run"])
        expected_count = len(models)
        assert len(results) == expected_count

0 commit comments

Comments
 (0)