Skip to content

Commit 9fa0eb1

Browse files
Fix disable_samples to exclude columns instead of skipping sampling
- Remove early return that skipped sampling entirely for disable_samples - Create unified get_columns_to_exclude_from_sampling helper macro - Integrate disable_samples with existing PII column exclusion logic - Update integration tests to verify column exclusion behavior - Add test for interaction between PII and disable_samples columns - Maintain backward compatibility with existing functionality Addresses GitHub feedback on PR #833 Co-Authored-By: Yosef Arbiv <[email protected]>
1 parent a159aa7 commit 9fa0eb1

File tree

3 files changed

+115
-17
lines changed

3 files changed

+115
-17
lines changed

integration_tests/tests/test_column_pii_sampling.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def test_column_pii_sampling_all_columns_pii(test_id: str, dbt_project: DbtProje
126126

127127
assert len(samples) == TEST_SAMPLE_ROW_COUNT
128128
for sample in samples:
129-
assert "_no_non_pii_columns" in sample
130-
assert sample["_no_non_pii_columns"] == 1
129+
assert "_no_non_excluded_columns" in sample
130+
assert sample["_no_non_excluded_columns"] == 1
131131
assert SENSITIVE_COLUMN not in sample
132132
assert SAFE_COLUMN not in sample

integration_tests/tests/test_disable_samples_config.py

Lines changed: 89 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,11 @@ def test_disable_samples_config_prevents_sampling(
5151
json.loads(row["result_row"])
5252
for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
5353
]
54-
assert len(samples) == 0
54+
assert len(samples) == 5
55+
for sample in samples:
56+
assert "_no_non_excluded_columns" in sample
57+
assert sample["_no_non_excluded_columns"] == 1
58+
assert COLUMN_NAME not in sample
5559

5660

5761
@pytest.mark.skip_targets(["clickhouse"])
@@ -84,7 +88,9 @@ def test_disable_samples_false_allows_sampling(test_id: str, dbt_project: DbtPro
8488
for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
8589
]
8690
assert len(samples) == 5
87-
assert all([row == {COLUMN_NAME: None} for row in samples])
91+
for sample in samples:
92+
assert COLUMN_NAME in sample
93+
assert sample[COLUMN_NAME] is None
8894

8995

9096
@pytest.mark.skip_targets(["clickhouse"])
@@ -119,4 +125,84 @@ def test_disable_samples_config_overrides_pii_tags(
119125
json.loads(row["result_row"])
120126
for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
121127
]
122-
assert len(samples) == 0
128+
assert len(samples) == 5
129+
for sample in samples:
130+
assert "_no_non_excluded_columns" in sample
131+
assert sample["_no_non_excluded_columns"] == 1
132+
assert COLUMN_NAME not in sample
133+
134+
135+
@pytest.mark.skip_targets(["clickhouse"])
136+
def test_disable_samples_and_pii_interaction(test_id: str, dbt_project: DbtProject):
137+
"""Test that disable_samples and PII columns both get excluded"""
138+
data = [
139+
{"col1": None, "col2": f"pii{i}", "col3": f"disabled{i}"} for i in range(10)
140+
]
141+
142+
columns = [
143+
{"name": "col1", "tests": [{"not_null": {}}]},
144+
{"name": "col2", "config": {"tags": ["pii"]}},
145+
{"name": "col3", "config": {"disable_samples": True}},
146+
]
147+
148+
test_result = dbt_project.test(
149+
test_id,
150+
"not_null",
151+
columns=columns,
152+
data=data,
153+
test_vars={
154+
"enable_elementary_test_materialization": True,
155+
"test_sample_row_count": 5,
156+
"disable_samples_on_pii_columns": True,
157+
"pii_column_tags": ["pii"],
158+
},
159+
)
160+
assert test_result["status"] == "fail"
161+
162+
samples = [
163+
json.loads(row["result_row"])
164+
for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
165+
]
166+
167+
assert len(samples) == 5
168+
for sample in samples:
169+
assert "col1" in sample
170+
assert "col2" not in sample
171+
assert "col3" not in sample
172+
173+
174+
@pytest.mark.skip_targets(["clickhouse"])
175+
def test_disable_samples_with_multiple_columns(test_id: str, dbt_project: DbtProject):
176+
"""Test that disable_samples excludes only the disabled column"""
177+
data = [{"col1": None, "col2": f"value{i}"} for i in range(10)]
178+
179+
columns = [
180+
{
181+
"name": "col1",
182+
"config": {"disable_samples": True},
183+
"tests": [{"not_null": {}}],
184+
},
185+
{"name": "col2"},
186+
]
187+
188+
test_result = dbt_project.test(
189+
test_id,
190+
"not_null",
191+
columns=columns,
192+
data=data,
193+
test_vars={
194+
"enable_elementary_test_materialization": True,
195+
"test_sample_row_count": 5,
196+
},
197+
)
198+
assert test_result["status"] == "fail"
199+
200+
samples = [
201+
json.loads(row["result_row"])
202+
for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
203+
]
204+
205+
assert len(samples) == 5
206+
for sample in samples:
207+
assert "col1" not in sample
208+
assert "col2" in sample

macros/edr/materializations/test/test.sql

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -113,18 +113,10 @@
113113
{% do return([]) %}
114114
{% endif %}
115115

116-
{% if flattened_test and elementary.is_sampling_disabled_for_column(flattened_test) %}
117-
{% do elementary.debug_log("Skipping sample query because disable_samples is true for this column.") %}
118-
{% do return([]) %}
119-
{% endif %}
120-
121-
{% set pii_columns = [] %}
122-
{% if flattened_test %}
123-
{% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %}
124-
{% endif %}
116+
{% set columns_to_exclude = elementary.get_columns_to_exclude_from_sampling(flattened_test) %}
125117

126118
{% set select_clause = "*" %}
127-
{% if pii_columns %}
119+
{% if columns_to_exclude %}
128120
{% set query_to_get_columns %}
129121
with test_results as (
130122
{{ sql }}
@@ -133,11 +125,11 @@
133125
{% endset %}
134126
{% set columns_result = elementary.run_query(query_to_get_columns) %}
135127
{% set all_columns = columns_result.column_names %}
136-
{% set safe_columns = all_columns | reject("in", pii_columns) | list %}
128+
{% set safe_columns = all_columns | reject("in", columns_to_exclude) | list %}
137129
{% if safe_columns %}
138130
{% set select_clause = safe_columns | join(", ") %}
139131
{% else %}
140-
{% set select_clause = "1 as _no_non_pii_columns" %}
132+
{% set select_clause = "1 as _no_non_excluded_columns" %}
141133
{% endif %}
142134
{% endif %}
143135

@@ -150,6 +142,26 @@
150142
{% do return(elementary.agate_to_dicts(elementary.run_query(query))) %}
151143
{% endmacro %}
152144

145+
{% macro get_columns_to_exclude_from_sampling(flattened_test) %}
146+
{% set columns_to_exclude = [] %}
147+
148+
{% if not flattened_test %}
149+
{% do return(columns_to_exclude) %}
150+
{% endif %}
151+
152+
{% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %}
153+
{% set columns_to_exclude = columns_to_exclude + pii_columns %}
154+
155+
{% if elementary.is_sampling_disabled_for_column(flattened_test) %}
156+
{% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %}
157+
{% if test_column_name and test_column_name not in columns_to_exclude %}
158+
{% do columns_to_exclude.append(test_column_name) %}
159+
{% endif %}
160+
{% endif %}
161+
162+
{% do return(columns_to_exclude) %}
163+
{% endmacro %}
164+
153165
{% macro is_sampling_disabled_for_column(flattened_test) %}
154166
{% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %}
155167
{% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %}

0 commit comments

Comments
 (0)