Skip to content

Commit 87a9d23

Browse files
committed
Add AI data validation macro for multiple cloud platforms
This commit introduces a new macro for AI-powered data validation across Snowflake, Databricks, and BigQuery. The implementation includes:
- A generic AI data validation test macro
- Platform-specific implementations for generating AI validation queries
- Support for different LLM models and validation prompts
1 parent 83f3da5 commit 87a9d23

File tree

2 files changed

+88
-0
lines changed

2 files changed

+88
-0
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
{#
    Private helper: returns the node stored under "model" in dbt's `context` variable.
    It exists because the `ai_data_validation` test declares its own `context` argument,
    which hides dbt's `context` inside that test; this macro has no such argument.
#}
{% macro get_ai_data_validation_test_node() %}
    {{ return(context["model"]) }}
{% endmacro %}

{#
    Generic dbt test that asks an LLM whether the values of a column meet a
    natural-language expectation.

    Args:
        model: the tested model, as handed to the test by dbt.
        column_name: the column whose values are sent to the LLM.
        expectation_prompt: the expectation, in plain language, that each value should meet.
        context: optional extra text inserted into the prompt before the expectation.
        llm_model_name: LLM identifier, passed unchanged to the adapter-specific macro.
            The adapter-specific macros declare their own defaults, but those are not
            reached from here because a name is always passed.

    Renders the adapter-specific validation query when the test actually runs;
    otherwise renders an empty-result query. Raises a compiler error when the tested
    model cannot be resolved to a relation.
#}
{% test ai_data_validation(model, column_name, expectation_prompt, context='', llm_model_name='claude-3-5-sonnet') %}
    {{ config(tags = ['elementary-tests']) }}
    {%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %}
        {# `context` here is the prompt argument (a string), so the test node is fetched through the helper instead of context["model"] #}
        {% set test_node = elementary.get_ai_data_validation_test_node() %}
        {% set model_relation = elementary.get_model_relation_for_test(model, test_node) %}
        {% if not model_relation %}
            {{ exceptions.raise_compiler_error("Unsupported model: " ~ model ~ " (this might happen if you override 'ref' or 'source')") }}
        {% endif %}

        {# Prompt to supply to the LLM; the column value is appended to it in SQL by the adapter-specific macro #}
        {% set context_part = context ~ " " if context else "" %}
        {% set prompt_template = "You are a data validator that should reply with string true if the expectation is met or the string false otherwise. " ~ context_part ~ "You got the following expectation: " ~ expectation_prompt ~ ". Your only role is to determine if the following text meets this expectation: " %}

        {# `model` (not the resolved relation) is passed on, exactly as dbt supplied it #}
        {{ elementary.generate_ai_data_validation_sql(model, column_name, prompt_template, llm_model_name) }}

    {%- else %}

        {#- test must run an sql query -#}
        {{ elementary.no_results_query() }}

    {%- endif %}
{% endtest %}
24+
25+
26+
{#
    Entry point for building the AI validation query: looks up the implementation
    registered for the current adapter under the 'elementary' namespace and returns
    whatever that implementation produces for the given arguments.
#}
{% macro generate_ai_data_validation_sql(model, column_name, prompt_template, llm_model_name) %}
    {% set adapter_impl = adapter.dispatch('generate_ai_data_validation_sql', 'elementary') %}
    {% do return(adapter_impl(model, column_name, prompt_template, llm_model_name)) %}
{% endmacro %}
29+
30+
{#
    Fallback implementation used when no adapter-specific macro matches:
    stops compilation with an error naming the current target type.
#}
{% macro default__generate_ai_data_validation_sql(model, column_name, prompt_template, llm_model_name) %}
    {% set unsupported_message = "AI data validation is not supported for target: " ~ target.type %}
    {% do exceptions.raise_compiler_error(unsupported_message) %}
{% endmacro %}
33+
34+
{#
    Snowflake implementation of the AI validation query.

    For every row of `model`, calls snowflake.cortex.complete with the given LLM name
    and the prompt followed by the column value cast to text, then keeps only the rows
    whose lower-cased answer contains "false".

    Args:
        model: what the query selects from.
        column_name: column whose value is appended to the prompt.
        prompt_template: prompt text placed before the column value.
        llm_model_name: model identifier passed to snowflake.cortex.complete.
#}
{% macro snowflake__generate_ai_data_validation_sql(model, column_name, prompt_template, llm_model_name) %}
    {# Escape backslashes first, then single quotes, so a prompt containing an apostrophe still renders as one valid SQL string literal #}
    {% set escaped_prompt = prompt_template | replace("\\", "\\\\") | replace("'", "\\'") %}
    with ai_data_validation_results as (
        select
            snowflake.cortex.complete(
                '{{ llm_model_name }}',
                concat('{{ escaped_prompt }}', {{ column_name }}::text)
            ) as result
        from {{ model }}
    )

    select result
    from ai_data_validation_results
    where lower(result) like '%false%'
{% endmacro %}
48+
49+
{#
    Databricks implementation of the AI validation query.

    For every row of `model`, calls ai_query with the LLM name and the prompt followed
    by the column value cast to string, then keeps only the rows whose lower-cased
    answer contains "false".

    Args:
        model: what the query selects from.
        column_name: column whose value is appended to the prompt.
        prompt_template: prompt text placed before the column value.
        llm_model_name: model identifier passed to ai_query; when it is none or empty
            the declared default is used instead.
#}
{% macro databricks__generate_ai_data_validation_sql(model, column_name, prompt_template, llm_model_name='databricks-meta-llama-3-3-70b-instruct') %}
    {# An explicit none/empty name would otherwise render as 'None' or '' in the SQL #}
    {% set resolved_llm_model_name = llm_model_name if llm_model_name else 'databricks-meta-llama-3-3-70b-instruct' %}
    {# Escape backslashes first, then single quotes, so a prompt containing an apostrophe still renders as one valid SQL string literal #}
    {% set escaped_prompt = prompt_template | replace("\\", "\\\\") | replace("'", "\\'") %}
    with ai_data_validation_results as (
        select
            ai_query(
                '{{ resolved_llm_model_name }}',
                concat('{{ escaped_prompt }}', cast({{ column_name }} as string))
            ) as result
        from {{ model }}
    )

    select result
    from ai_data_validation_results
    where lower(result) like '%false%'
{% endmacro %}
63+
64+
65+
{#
    BigQuery implementation of the AI validation query.

    Feeds ML.GENERATE_TEXT one prompt per row of `model` (the prompt text followed by
    the column value cast to string), reads the flattened text answer, and keeps only
    the rows whose lower-cased answer contains "false".

    Args:
        model: what the inner query selects from; its `schema` attribute also qualifies
            the BigQuery ML model name.
        column_name: column whose value is appended to the prompt.
        prompt_template: prompt text placed before the column value.
        llm_model_name: name of the BigQuery ML model inside the schema of `model`;
            when it is none or empty the declared default is used instead.

    NOTE(review): `model.schema` presumes `model` is a relation object; if `model`
    arrives as a plain string this lookup has nothing to resolve. Confirm against callers.
#}
{% macro bigquery__generate_ai_data_validation_sql(model, column_name, prompt_template, llm_model_name='gemini-1.5-pro') %}
    {# An explicit none/empty name would otherwise render as `None` or an empty model name #}
    {% set resolved_llm_model_name = llm_model_name if llm_model_name else 'gemini-1.5-pro' %}
    {# Escape backslashes first, then single quotes, so a prompt containing an apostrophe still renders as one valid SQL string literal #}
    {% set escaped_prompt = prompt_template | replace("\\", "\\\\") | replace("'", "\\'") %}
    with ai_data_validation_results as (
        select ml_generate_text_llm_result as result
        from
            ML.GENERATE_TEXT(
                MODEL `{{ model.schema }}.{{ resolved_llm_model_name }}`,
                (
                    select
                        {# Cast keeps concat valid for non-string columns, matching the other adapters #}
                        concat(
                            '{{ escaped_prompt }}',
                            cast({{ column_name }} as string)
                        ) as prompt
                    from {{ model }}
                ),
                struct(true as flatten_json_output)
            )
    )

    select result
    from ai_data_validation_results
    where lower(result) like '%false%'
{% endmacro %}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{#
    Generic dbt test for unstructured text columns: delegates to the
    `ai_data_validation` test, supplying a fixed sentence as its prompt context
    and forwarding every other argument unchanged.
#}
{% test unstructured_data_validation(model, column_name, expectation_prompt, llm_model_name='claude-3-5-sonnet') %}
    {{ config(tags = ['elementary-tests']) }}
    {% set validator_framing = "You are a data validator specializing in validating unstructured data." %}
    {% do return(elementary.test_ai_data_validation(model, column_name, expectation_prompt, validator_framing, llm_model_name)) %}
{% endtest %}

0 commit comments

Comments
 (0)