Add configuration options for enable_list_inference and intermediate_format for python models (#1205)

mikealfare · web-flow · commit 82253130acf1 · 2024-05-07T17:05:19.000-04:00
* add configuration options for enable_list_inference and intermediate_format for python models
* update test classes so that they run as part of the python model tests in tox
* reconfigure tests to force serial run instead of parallel
diff --git a/.changes/unreleased/Features-20240426-105319.yaml b/.changes/unreleased/Features-20240426-105319.yaml
@@ -0,0 +1,7 @@
+kind: Features
+body: Add configuration options `enable_list_inference` and `intermediate_format` for python
+  models
+time: 2024-04-26T10:53:19.874239-04:00
+custom:
+  Author: mikealfare
+  Issue: 1047 1114
diff --git a/.changes/unreleased/Fixes-20240426-105224.yaml b/.changes/unreleased/Fixes-20240426-105224.yaml
@@ -0,0 +1,7 @@
+kind: Fixes
+body: Default `enableListInference` to `True` for python models to support nested
+  lists
+time: 2024-04-26T10:52:24.827314-04:00
+custom:
+  Author: mikealfare
+  Issue: 1047 1114
diff --git a/dbt/adapters/bigquery/impl.py b/dbt/adapters/bigquery/impl.py
@@ -99,6 +99,8 @@ class BigqueryConfig(AdapterConfig):
     enable_refresh: Optional[bool] = None
     refresh_interval_minutes: Optional[int] = None
     max_staleness: Optional[str] = None
+    enable_list_inference: Optional[bool] = None
+    intermediate_format: Optional[str] = None
 
 
 class BigQueryAdapter(BaseAdapter):
diff --git a/dbt/include/bigquery/macros/materializations/table.sql b/dbt/include/bigquery/macros/materializations/table.sql
@@ -49,12 +49,19 @@
 from pyspark.sql import SparkSession
 {%- set raw_partition_by = config.get('partition_by', none) -%}
 {%- set raw_cluster_by = config.get('cluster_by', none) -%}
+{%- set enable_list_inference = config.get('enable_list_inference', true) -%}
+{%- set intermediate_format = config.get('intermediate_format', none) -%}
+
 {%- set partition_config = adapter.parse_partition_by(raw_partition_by) %}
 
 spark = SparkSession.builder.appName('smallTest').getOrCreate()
 
 spark.conf.set("viewsEnabled","true")
 spark.conf.set("temporaryGcsBucket","{{target.gcs_bucket}}")
+spark.conf.set("enableListInference", "{{ enable_list_inference }}")
+{% if intermediate_format %}
+spark.conf.set("intermediateFormat", "{{ intermediate_format }}")
+{% endif %}
 
 {{ compiled_code }}
 dbt = dbtObj(spark.read.format("bigquery").load)
diff --git a/tests/functional/python_model_tests/__init__.py b/tests/functional/python_model_tests/__init__.py
diff --git a/tests/functional/python_model_tests/files.py b/tests/functional/python_model_tests/files.py
@@ -0,0 +1,125 @@
+SINGLE_RECORD = """
+import pandas as pd
+
+def model(dbt, session):
+
+    dbt.config(
+        submission_method="serverless",
+        materialized="table"
+    )
+
+    df = pd.DataFrame(
+        [
+            {"column_name": {"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}},
+        ]
+    )
+
+    return df
+"""
+
+
+MULTI_RECORD = """
+import pandas as pd
+
+def model(dbt, session):
+
+    dbt.config(
+        submission_method="serverless",
+        materialized="table",
+    )
+
+    df = pd.DataFrame(
+        [
+            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
+        ]
+    )
+
+    return df
+"""
+
+
+ORC_FORMAT = """
+import pandas as pd
+
+def model(dbt, session):
+
+    dbt.config(
+        submission_method="serverless",
+        materialized="table",
+        intermediate_format="orc",
+    )
+
+    df = pd.DataFrame(
+        [
+            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
+        ]
+    )
+
+    return df
+"""
+
+
+ENABLE_LIST_INFERENCE = """
+import pandas as pd
+
+def model(dbt, session):
+
+    dbt.config(
+        submission_method="serverless",
+        materialized="table",
+        enable_list_inference="true",
+    )
+
+    df = pd.DataFrame(
+        [
+            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
+        ]
+    )
+
+    return df
+"""
+
+
+ENABLE_LIST_INFERENCE_PARQUET_FORMAT = """
+import pandas as pd
+
+def model(dbt, session):
+
+    dbt.config(
+        submission_method="serverless",
+        materialized="table",
+        enable_list_inference="true",
+        intermediate_format="parquet",
+    )
+
+    df = pd.DataFrame(
+        [
+            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
+        ]
+    )
+
+    return df
+"""
+
+
+DISABLE_LIST_INFERENCE_ORC_FORMAT = """
+import pandas as pd
+
+def model(dbt, session):
+
+    dbt.config(
+        submission_method="serverless",
+        materialized="table",
+        enable_list_inference="false",
+        intermediate_format="orc",
+    )
+
+    df = pd.DataFrame(
+        [
+            {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]},
+        ]
+    )
+
+    return df
+
+"""
diff --git a/tests/functional/python_model_tests/test_list_inference.py b/tests/functional/python_model_tests/test_list_inference.py
@@ -0,0 +1,36 @@
+"""
+This test case addresses this regression: https://github.com/dbt-labs/dbt-bigquery/issues/1047
+
+As the comments on that issue point out, the failure appears when the settings are:
+    - list inference: off
+    - intermediate format: parquet
+
+Changing either setting (turning list inference on, or using orc) avoids the failure.
+
+When the regression was first reported, `files.MULTI_RECORD` failed while the other models passed.
+"""
+from dbt.tests.util import run_dbt_and_capture
+import pytest
+
+from tests.functional.python_model_tests import files
+
+
+class TestListInference:  # runs all python model variants below in a single `dbt run`
+    @pytest.fixture(scope="class")
+    def models(self):  # maps each model file name to its python model source from `files`
+        return {
+            # dict-valued column with a nested list; this is what worked prior to this issue
+            "single_record.py": files.SINGLE_RECORD,
+            # list-of-dicts column; this is the model that initially failed for this issue
+            "multi_record.py": files.MULTI_RECORD,
+            # these set explicitly what the default settings are expected to be
+            "enable_list_inference.py": files.ENABLE_LIST_INFERENCE,
+            "enable_list_inference_parquet_format.py": files.ENABLE_LIST_INFERENCE_PARQUET_FORMAT,
+            # orc format also resolves the issue, regardless of list inference
+            "orc_format.py": files.ORC_FORMAT,
+            "disable_list_inference_orc_format.py": files.DISABLE_LIST_INFERENCE_ORC_FORMAT,
+        }
+
+    def test_models_success(self, project, models):
+        result, output = run_dbt_and_capture(["run"])  # `output` is captured but not inspected
+        assert len(result) == len(models)  # expect one run result per model in the fixture