Merge remote-tracking branch 'origin/master' into devin-ELE-4850-1753859795

arbiv · arbiv · commit c45cba366063 · 2025-07-31T18:12:06.000+03:00
diff --git a/.github/workflows/test-warehouse.yml b/.github/workflows/test-warehouse.yml
@@ -123,7 +123,7 @@ jobs:
         run: |
           mkdir -p ~/.dbt
           DBT_VERSION=$(pip show dbt-core | grep -i version | awk '{print $2}' | sed 's/\.//g')
-          UNDERSCORED_REF_NAME=$(echo "${{ inputs.warehouse-type }}_dbt_${DBT_VERSION}_${BRANCH_NAME}" | awk '{print tolower($0)}' | head -c 40 | sed "s/-/_/g")
+          UNDERSCORED_REF_NAME=$(echo "${{ inputs.warehouse-type }}_dbt_${DBT_VERSION}_${BRANCH_NAME}" | awk '{print tolower($0)}' | head -c 40 | sed "s/[-\/]/_/g")
           echo "$PROFILES_YML" | base64 -d | sed "s/<SCHEMA_NAME>/dbt_pkg_$UNDERSCORED_REF_NAME/g" > ~/.dbt/profiles.yml
 
       - name: Check DWH connection
diff --git a/integration_tests/tests/dbt_project.py b/integration_tests/tests/dbt_project.py
@@ -109,6 +109,7 @@ def test(
         materialization: str = "table",  # Only relevant if as_model=True
         test_vars: Optional[dict] = None,
         elementary_enabled: bool = True,
+        model_config: Optional[Dict[str, Any]] = None,
         *,
         multiple_results: Literal[False] = False,
     ) -> Dict[str, Any]:
@@ -128,6 +129,7 @@ def test(
         materialization: str = "table",  # Only relevant if as_model=True
         test_vars: Optional[dict] = None,
         elementary_enabled: bool = True,
+        model_config: Optional[Dict[str, Any]] = None,
         *,
         multiple_results: Literal[True],
     ) -> List[Dict[str, Any]]:
@@ -146,6 +148,7 @@ def test(
         materialization: str = "table",  # Only relevant if as_model=True
         test_vars: Optional[dict] = None,
         elementary_enabled: bool = True,
+        model_config: Optional[Dict[str, Any]] = None,
         *,
         multiple_results: bool = False,
     ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
@@ -161,6 +164,9 @@ def test(
         test_args = test_args or {}
         table_yaml: Dict[str, Any] = {"name": test_id}
 
+        if model_config:
+            table_yaml.update(model_config)
+
         if columns:
             table_yaml["columns"] = columns
 
diff --git a/integration_tests/tests/test_sampling_pii.py b/integration_tests/tests/test_sampling_pii.py
@@ -0,0 +1,232 @@
+import json
+
+import pytest
+from dbt_project import DbtProject
+
+COLUMN_NAME = "value"
+
+
+SAMPLES_QUERY = """
+    with latest_elementary_test_result as (
+        select id
+        from {{{{ ref("elementary_test_results") }}}}
+        where lower(table_name) = lower('{test_id}')
+        order by created_at desc, id desc
+        limit 1
+    )
+
+    select result_row
+    from {{{{ ref("test_result_rows") }}}}
+    where elementary_test_results_id in (select * from latest_elementary_test_result)
+"""
+
+TEST_SAMPLE_ROW_COUNT = 7
+
+
+@pytest.mark.skip_targets(["clickhouse"])
+def test_sampling_pii_disabled(test_id: str, dbt_project: DbtProject):
+    """Test that PII-tagged tables upload no samples when disable_samples_on_pii_tags is on"""
+    null_count = 50
+    data = [{COLUMN_NAME: None} for _ in range(null_count)]
+
+    test_result = dbt_project.test(
+        test_id,
+        "not_null",
+        dict(column_name=COLUMN_NAME),
+        data=data,
+        as_model=True,
+        model_config={"config": {"tags": ["pii"]}},
+        test_vars={
+            "enable_elementary_test_materialization": True,
+            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
+            "disable_samples_on_pii_tags": True,
+            "pii_tags": ["pii", "sensitive"],
+        },
+    )
+    assert test_result["status"] == "fail"
+
+    samples = [
+        json.loads(row["result_row"])
+        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
+    ]
+    assert len(samples) == 0
+
+
+@pytest.mark.skip_targets(["clickhouse"])
+def test_sampling_pii_disabled_with_default_config_and_casing(
+    test_id: str, dbt_project: DbtProject
+):
+    null_count = 50
+    data = [{COLUMN_NAME: None} for _ in range(null_count)]
+
+    test_result = dbt_project.test(
+        test_id,
+        "not_null",
+        dict(column_name=COLUMN_NAME),
+        data=data,
+        as_model=True,
+        model_config={"config": {"tags": ["pIi"]}},  # mixed case on purpose
+        test_vars={
+            "enable_elementary_test_materialization": True,
+            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
+            "disable_samples_on_pii_tags": True,  # pii_tags is left at its default
+        },
+    )
+    assert test_result["status"] == "fail"
+
+    samples = [
+        json.loads(row["result_row"])
+        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
+    ]
+    assert len(samples) == 0
+
+
+@pytest.mark.skip_targets(["clickhouse"])
+def test_sampling_pii_enabled_with_default_config(
+    test_id: str, dbt_project: DbtProject
+):
+    """Test that PII-tagged tables still upload samples under the default config"""
+    null_count = 50
+    data = [{COLUMN_NAME: None} for _ in range(null_count)]
+
+    test_result = dbt_project.test(
+        test_id,
+        "not_null",
+        dict(column_name=COLUMN_NAME),
+        data=data,
+        as_model=True,
+        model_config={"config": {"tags": ["pii"]}},
+        test_vars={
+            "enable_elementary_test_materialization": True,
+            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
+        },
+    )
+    assert test_result["status"] == "fail"
+
+    samples = [
+        json.loads(row["result_row"])
+        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
+    ]
+    assert len(samples) == TEST_SAMPLE_ROW_COUNT
+
+
+@pytest.mark.skip_targets(["clickhouse"])
+def test_sampling_non_pii_enabled(test_id: str, dbt_project: DbtProject):
+    """Test that tables without a PII tag still upload samples while PII protection is on"""
+    null_count = 50
+    data = [{COLUMN_NAME: None} for _ in range(null_count)]
+
+    test_result = dbt_project.test(
+        test_id,
+        "not_null",
+        dict(column_name=COLUMN_NAME),
+        data=data,
+        as_model=True,
+        model_config={"config": {"tags": ["normal"]}},
+        test_vars={
+            "enable_elementary_test_materialization": True,
+            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
+            "disable_samples_on_pii_tags": True,
+            "pii_tags": ["pii", "sensitive"],
+        },
+    )
+    assert test_result["status"] == "fail"
+
+    samples = [
+        json.loads(row["result_row"])
+        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
+    ]
+    assert len(samples) == TEST_SAMPLE_ROW_COUNT
+
+
+@pytest.mark.skip_targets(["clickhouse"])
+def test_sampling_pii_feature_disabled(test_id: str, dbt_project: DbtProject):
+    """Test that PII-tagged tables still upload samples when disable_samples_on_pii_tags is False"""
+    null_count = 50
+    data = [{COLUMN_NAME: None} for _ in range(null_count)]
+
+    test_result = dbt_project.test(
+        test_id,
+        "not_null",
+        dict(column_name=COLUMN_NAME),
+        data=data,
+        as_model=True,
+        model_config={"config": {"tags": ["pii"]}},
+        test_vars={
+            "enable_elementary_test_materialization": True,
+            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
+            "disable_samples_on_pii_tags": False,
+            "pii_tags": ["pii", "sensitive"],
+        },
+    )
+    assert test_result["status"] == "fail"
+
+    samples = [
+        json.loads(row["result_row"])
+        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
+    ]
+    assert len(samples) == TEST_SAMPLE_ROW_COUNT
+
+
+@pytest.mark.skip_targets(["clickhouse"])
+def test_sampling_disable_samples_overrides_pii(test_id: str, dbt_project: DbtProject):
+    """Test that no samples are uploaded when meta disable_test_samples and a PII tag are both set"""
+    null_count = 50
+    data = [{COLUMN_NAME: None} for _ in range(null_count)]
+
+    test_result = dbt_project.test(
+        test_id,
+        "not_null",
+        dict(column_name=COLUMN_NAME),
+        data=data,
+        as_model=True,
+        model_config={
+            "config": {"meta": {"disable_test_samples": True}, "tags": ["pii"]}
+        },
+        test_vars={
+            "enable_elementary_test_materialization": True,
+            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
+            "disable_samples_on_pii_tags": True,
+            "pii_tags": ["pii"],
+        },
+    )
+    assert test_result["status"] == "fail"
+
+    samples = [
+        json.loads(row["result_row"])
+        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
+    ]
+    assert len(samples) == 0
+
+
+@pytest.mark.skip_targets(["clickhouse"])
+def test_sampling_disable_samples_false_allows_samples(
+    test_id: str, dbt_project: DbtProject
+):
+    """Test that meta disable_test_samples: false leaves sample collection on for a non-PII table"""
+    null_count = 50
+    data = [{COLUMN_NAME: None} for _ in range(null_count)]
+
+    test_result = dbt_project.test(
+        test_id,
+        "not_null",
+        dict(column_name=COLUMN_NAME),
+        data=data,
+        as_model=True,
+        model_config={
+            "config": {"meta": {"disable_test_samples": False}, "tags": ["normal"]}
+        },
+        test_vars={
+            "enable_elementary_test_materialization": True,
+            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
+            "disable_samples_on_pii_tags": False,
+            "pii_tags": ["pii"],
+        },
+    )
+    assert test_result["status"] == "fail"
+
+    samples = [
+        json.loads(row["result_row"])
+        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
+    ]
+    assert len(samples) == TEST_SAMPLE_ROW_COUNT
diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql
@@ -50,7 +50,20 @@
 
 {% macro handle_dbt_test(flattened_test, materialization_macro) %}
   {% set result = materialization_macro() %}
-  {% set result_rows = elementary.query_test_result_rows(sample_limit=elementary.get_config_var('test_sample_row_count'),
+  {% set sample_limit = elementary.get_config_var('test_sample_row_count') %}
+  
+  {% set disable_test_samples = false %}
+  {% if "meta" in flattened_test and "disable_test_samples" in flattened_test["meta"] %}
+    {% set disable_test_samples = flattened_test["meta"]["disable_test_samples"] %}
+  {% endif %}
+  
+  {% if disable_test_samples %}
+    {% set sample_limit = 0 %}
+  {% elif elementary.is_pii_table(flattened_test) %}
+    {% set sample_limit = 0 %}
+  {% endif %}
+  
+  {% set result_rows = elementary.query_test_result_rows(sample_limit=sample_limit,
                                                          ignore_passed_tests=true,
                                                          flattened_test=flattened_test) %}
   {% set elementary_test_results_row = elementary.get_dbt_test_result_row(flattened_test, result_rows) %}
diff --git a/macros/edr/system/system_utils/get_config_var.sql b/macros/edr/system/system_utils/get_config_var.sql
@@ -83,7 +83,9 @@
     },
     'include_other_warehouse_specific_columns': false,
     'fail_on_zero': false,
-    'anomaly_exclude_metrics': none
+    'anomaly_exclude_metrics': none,
+    'disable_samples_on_pii_tags': false,
+    'pii_tags': ['pii']
   } %}
   {{- return(default_config) -}}
 {%- endmacro -%}
diff --git a/macros/edr/system/system_utils/is_pii_table.sql b/macros/edr/system/system_utils/is_pii_table.sql
@@ -0,0 +1,17 @@
+{% macro is_pii_table(flattened_test) %}
+  {% set disable_samples_on_pii_tags = elementary.get_config_var('disable_samples_on_pii_tags') %}
+  {% if not disable_samples_on_pii_tags %}
+    {% do return(false) %}
+  {% endif %}
+  {# Feature is on: return true when any model tag equals a configured pii_tags entry, ignoring case. #}
+  {% set raw_pii_tags = elementary.get_config_var('pii_tags') %}
+  {% set pii_tags = (raw_pii_tags if (raw_pii_tags is iterable and raw_pii_tags is not string) else [raw_pii_tags]) | map('lower') | list %}
+  {# A bare string also passes Jinja's `iterable` test and would be lower-cased per character, so it is wrapped in a list. #}
+  {% set raw_model_tags = elementary.insensitive_get_dict_value(flattened_test, 'model_tags', []) %}
+  {% set model_tags = (raw_model_tags if (raw_model_tags is iterable and raw_model_tags is not string) else [raw_model_tags]) | map('lower') | list %}
+  {# Both lists are lower-cased above; a non-empty intersection marks the tested model as PII. #}
+  {% set intersection = elementary.lists_intersection(model_tags, pii_tags) %}
+  {% set is_pii = intersection | length > 0 %}
+  
+  {% do return(is_pii) %}
+{% endmacro %}