Commit f27e8d7

Author: Michael Myaskovsky
Commit message: fixed null insertion
Parent: 1d9374c

12 files changed: +140 −41 lines


dbt_project.yml

Lines changed: 0 additions & 3 deletions
@@ -3,9 +3,6 @@ version: "0.18.1"
 
 require-dbt-version: [">=1.0.0", "<2.0.0"]
 
-flags:
-  require_explicit_package_overrides_for_builtin_materializations: false
-
 config-version: 2
 profile: "elementary"

macros/edr/data_monitoring/anomaly_detection/get_anomaly_scores_query.sql

Lines changed: 1 addition & 1 deletion
@@ -164,7 +164,7 @@
     bucket_duration_hours,
     updated_at,
     avg(metric_value) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_avg,
-    stddev(metric_value) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_stddev,
+    {{ elementary.standard_deviation('metric_value') }} over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_stddev,
     count(metric_value) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_set_size,
     last_value(bucket_end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) training_end,
     first_value(bucket_end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_start
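
This swap follows dbt's adapter.dispatch pattern used throughout the commit: an entry macro delegates to an adapter-specific `<adapter>__<name>` implementation when one exists, falling back to `default__<name>`. Replacing the hard-coded `stddev` lets ClickHouse supply its own spelling (ClickHouse names the sample standard deviation `stddevSamp`). The macro body is not part of this diff; a hedged sketch of what it plausibly looks like:

{# Hypothetical sketch — the actual elementary.standard_deviation macro is not shown in this commit. #}
{% macro standard_deviation(column_name) %}
    {{ return(adapter.dispatch('standard_deviation', 'elementary')(column_name)) }}
{% endmacro %}

{% macro default__standard_deviation(column_name) %}
    stddev({{ column_name }})
{% endmacro %}

{% macro clickhouse__standard_deviation(column_name) %}
    stddevSamp({{ column_name }})
{% endmacro %}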

macros/edr/data_monitoring/monitors_query/dimension_monitoring_query.sql

Lines changed: 3 additions & 4 deletions
@@ -64,7 +64,7 @@
     select edr_bucket_start, edr_bucket_end, dimension_value
     from training_set_dimensions left join buckets
     on (buckets.joiner = training_set_dimensions.joiner
-        {# This makes sure we don't create empty buckets for dimensions before their first appearance #}
+        {# This makes sure we dont create empty buckets for dimensions before their first appearance #}
         and edr_bucket_end >= dimension_min_bucket_end)
     where dimension_value is not null
 ),
@@ -202,13 +202,12 @@
         {{ elementary.null_timestamp() }} as bucket_start,
         bucket_end,
         {{ elementary.null_int() }} as bucket_duration_hours,
-        {{ elementary.const_as_string(dimensions_string) }} as dimension,
-        dimension_value,
+        {{ elementary.null_string() }} as dimension,
+        {{ elementary.null_string() }} as dimension_value,
         {{ elementary.dict_to_quoted_json(metric_properties) }} as metric_properties
     from row_count
 )
 {% endif %}
-
 select
     {{ elementary.generate_surrogate_key([
         'full_table_name',
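
The second hunk is the heart of the commit message: when the dimension monitoring query falls back to a bare row count, `dimension` and `dimension_value` are now emitted as typed NULLs instead of constant strings, so the rows insert cleanly into the (now Nullable) ClickHouse columns. The null helpers are not shown in this diff; assuming they simply cast NULL to the adapter's type, they would look roughly like:

{# Assumed shape of the helpers used above — illustrative, not the package's actual source. #}
{% macro null_string() %}
    cast(null as {{ elementary.edr_type_string() }})
{% endmacro %}

{% macro null_int() %}
    cast(null as {{ elementary.edr_type_int() }})
{% endmacro %}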

macros/edr/system/system_utils/empty_table.sql

Lines changed: 33 additions & 0 deletions
@@ -54,6 +54,10 @@
 {% endmacro %}
 
 {% macro empty_data_monitoring_metrics(with_created_at=true) %}
+    {{ return(adapter.dispatch('empty_data_monitoring_metrics', 'elementary')(with_created_at)) }}
+{% endmacro %}
+
+{% macro default__empty_data_monitoring_metrics(with_created_at=true) %}
     {% set columns = [('id','string'),
                       ('full_table_name','string'),
                       ('column_name','string'),
@@ -75,6 +79,27 @@
     {{ elementary.empty_table(columns) }}
 {% endmacro %}
 
+{% macro clickhouse__empty_data_monitoring_metrics(with_created_at=true) %}
+    {% set columns = [('id','string'),
+                      ('full_table_name','nullable(string)'),
+                      ('column_name','nullable(string)'),
+                      ('metric_name','nullable(string)'),
+                      ('metric_type','nullable(string)'),
+                      ('metric_value','nullable(float)'),
+                      ('source_value','nullable(string)'),
+                      ('bucket_start','timestamp'),
+                      ('bucket_end','timestamp'),
+                      ('bucket_duration_hours','nullable(int)'),
+                      ('updated_at','nullable(timestamp)'),
+                      ('dimension','nullable(string)'),
+                      ('dimension_value','nullable(string)'),
+                      ('metric_properties','string')]
+    %}
+    {% if with_created_at %}
+        {% do columns.append(('created_at','nullable(timestamp)')) %}
+    {% endif %}
+    {{ elementary.empty_table(columns) }}
+{% endmacro %}
 
 {% macro empty_schema_columns_snapshot() %}
     {{ elementary.empty_table([('column_state_id','string'),('full_column_name','string'),('full_table_name','string'),('column_name','string'),('data_type','string'),('is_new','boolean'),('detected_at','timestamp'),('created_at','timestamp')]) }}
@@ -124,6 +149,14 @@
     cast({{ dummy_values['float'] }} as {{ elementary.edr_type_float() }}) as {{ column_name }}
 {%- elif data_type == 'long_string' %}
     cast('{{ dummy_values['long_string'] }}' as {{ elementary.edr_type_long_string() }}) as {{ column_name }}
+{%- elif data_type == 'nullable(string)' %}
+    cast('{{ dummy_values['string'] }}' as Nullable({{ elementary.edr_type_string() }})) as {{ column_name }}
+{%- elif data_type == 'nullable(timestamp)' -%}
+    cast('{{ dummy_values['timestamp'] }}' as Nullable({{ elementary.edr_type_timestamp() }})) as {{ column_name }}
+{%- elif data_type == 'nullable(float)' -%}
+    cast({{ dummy_values['float'] }} as Nullable({{ elementary.edr_type_float() }})) as {{ column_name }}
+{%- elif data_type == 'nullable(int)' -%}
+    cast({{ dummy_values['int'] }} as Nullable({{ elementary.edr_type_int() }})) as {{ column_name }}
 {%- else %}
     cast('{{ dummy_values['string'] }}' as {{ elementary.edr_type_string() }}) as {{ column_name }}
 {%- endif %}
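
The dedicated ClickHouse variant exists because ClickHouse columns reject NULL values unless the type is wrapped in Nullable(T) — the likely source of the null-insertion failures this commit fixes. A minimal plain-SQL illustration (hypothetical table names, Memory engine for brevity):

-- Inserting NULL into a plain String column fails by default
-- (or is coerced to '' when input_format_null_as_default is enabled).
create table t_plain (dimension String) engine = Memory;

-- Wrapping the type in Nullable makes NULL a first-class value.
create table t_nullable (dimension Nullable(String)) engine = Memory;
insert into t_nullable values (NULL);  -- ok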

macros/edr/system/system_utils/get_config_var.sql

Lines changed: 6 additions & 0 deletions
@@ -75,6 +75,12 @@
     {{- return(default_config) -}}
 {%- endmacro -%}
 
+{%- macro clickhouse__get_default_config() -%}
+    {% set default_config = elementary.default__get_default_config() %}
+    {% do default_config.update({'query_max_size': 250000}) %}
+    {{- return(default_config) -}}
+{%- endmacro -%}
+
 {%- macro athena__get_default_config() -%}
     {% set default_config = elementary.default__get_default_config() %}
     {% do default_config.update({'query_max_size': 250000}) %}
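
The new override caps `query_max_size` at 250,000 characters for ClickHouse, mirroring the existing Athena override. Assuming elementary resolves these settings through dbt vars (which is what `get_config_var` implies), a project could presumably tighten the cap further in its own dbt_project.yml:

# Hypothetical project-level override of the config key set above.
vars:
  query_max_size: 100000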

macros/edr/tests/test_column_anomalies.sql

Lines changed: 0 additions & 1 deletion
@@ -80,7 +80,6 @@
                                                 dimensions) %}
     {{ elementary.debug_log('column_monitoring_query - \n' ~ column_monitoring_query) }}
     {% set temp_table_relation = elementary.create_elementary_test_table(database_name, tests_schema_name, test_table_name, 'metrics', column_monitoring_query) %}
-
     {#- calculate anomaly scores for metrics -#}
     {% set anomaly_scores_query = elementary.get_anomaly_scores_query(test_metrics_table_relation=temp_table_relation,
                                                                       model_relation=model_relation,

macros/edr/tests/test_dimension_anomalies.sql

Lines changed: 1 addition & 3 deletions
@@ -1,6 +1,6 @@
 {% test dimension_anomalies(model, dimensions, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, exclude_final_results) %}
     {{ config(tags = ['elementary-tests']) }}
-    {%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %}
+    {%- if execute and elementary.is_test_command()%}
     {% set model_relation = elementary.get_model_relation_for_test(model, context["model"]) %}
     {% if not model_relation %}
         {{ exceptions.raise_compiler_error("Unsupported model: " ~ model ~ " (this might happen if you override 'ref' or 'source')") }}
@@ -58,9 +58,7 @@
 
     {%- set dimension_monitoring_query = elementary.dimension_monitoring_query(model, model_relation, metric_properties.dimensions, min_bucket_start, max_bucket_end, metric_properties) %}
     {{ elementary.debug_log('dimension_monitoring_query - \n' ~ dimension_monitoring_query) }}
-
     {% set temp_table_relation = elementary.create_elementary_test_table(database_name, tests_schema_name, test_table_name, 'metrics', dimension_monitoring_query) %}
-
     {#- calculate anomaly scores for metrics -#}
     {% set anomaly_scores_query = elementary.get_anomaly_scores_query(test_metrics_table_relation=temp_table_relation,
                                                                       model_relation=model_relation,
model_relation=model_relation,

macros/edr/tests/test_table_anomalies.sql

Lines changed: 1 addition & 2 deletions
@@ -1,6 +1,6 @@
 {% test table_anomalies(model, table_anomalies, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, mandatory_params=none, event_timestamp_column=none, freshness_column=none, sensitivity=none, ignore_small_changes={"spike_failure_percent_threshold": none, "drop_failure_percent_threshold": none}, fail_on_zero=false, detection_delay=none, anomaly_exclude_metrics=none, detection_period=none, training_period=none) %}
     {{ config(tags = ['elementary-tests']) }}
-    {%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %}
+    {%- if execute and elementary.is_test_command() %}
     {% set model_relation = elementary.get_model_relation_for_test(model, context["model"]) %}
     {% if not model_relation %}
         {{ exceptions.raise_compiler_error("The test has unsupported configuration, please contact Elementary support") }}
@@ -71,7 +71,6 @@
                                                            metric_properties=metric_properties) %}
     {{ elementary.debug_log('table_monitoring_query - \n' ~ table_monitoring_query) }}
     {% set temp_table_relation = elementary.create_elementary_test_table(database_name, tests_schema_name, test_table_name, 'metrics', table_monitoring_query) %}
-
     {#- calculate anomaly scores for metrics -#}
     {% set anomaly_scores_query = elementary.get_anomaly_scores_query(temp_table_relation,
                                                                       model_relation,
model_relation,

macros/edr/tests/test_utils/get_anomaly_query.sql

Lines changed: 64 additions & 25 deletions
@@ -60,38 +60,77 @@ case when
     from anomaly_scores
 ),
 
-final_results as (
-    select
-        metric_value as value,
-        training_avg as average,
-        {# when there is an anomaly we would want to use the last value of the metric (lag), otherwise visually the expectations would look out of bounds #}
-        case
-            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' = 'spike' then
-                lag(metric_value) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
-            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' != 'spike' then
-                lag(min_metric_value) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
-            when '{{ test_configuration.anomaly_direction }}' = 'spike' then metric_value
-            else min_metric_value end as min_value,
-        case
-            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' = 'drop' then
-                lag(metric_value) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
-            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' != 'drop' then
-                lag(max_metric_value) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
-            when '{{ test_configuration.anomaly_direction }}' = 'drop' then metric_value
-            else max_metric_value end as max_value,
-        bucket_start as start_time,
-        bucket_end as end_time,
-        *
-    from anomaly_scores_with_is_anomalous
-    order by bucket_end, dimension_value
-)
+{{ elementary.get_final_results_query(test_configuration) }}
 
 select * from final_results
 where {{ test_configuration.exclude_final_results }}
 {%- endset -%}
 {{- return(anomaly_query) -}}
 {% endmacro %}
 
+{% macro get_final_results_query(test_configuration) %}
+    {{ return(adapter.dispatch('get_final_results_query', 'elementary')(test_configuration)) }}
+{% endmacro %}
+
+{% macro default__get_final_results_query(test_configuration) %}
+final_results as (
+    select
+        metric_value as value,
+        training_avg as average,
+        {# when there is an anomaly we would want to use the last value of the metric (lag), otherwise visually the expectations would look out of bounds #}
+        case
+            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' = 'spike' then
+                lag(metric_value) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
+            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' != 'spike' then
+                lag(min_metric_value) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
+            when '{{ test_configuration.anomaly_direction }}' = 'spike' then metric_value
+            else min_metric_value
+        end as min_value,
+        case
+            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' = 'drop' then
+                lag(metric_value) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
+            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' != 'drop' then
+                lag(max_metric_value) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
+            when '{{ test_configuration.anomaly_direction }}' = 'drop' then metric_value
+            else max_metric_value
+        end as max_value,
+        bucket_start as start_time,
+        bucket_end as end_time,
+        *
+    from anomaly_scores_with_is_anomalous
+    order by bucket_end, dimension_value
+)
+{% endmacro %}
+
+{% macro clickhouse__get_final_results_query(test_configuration) %}
+final_results as (
+    select
+        metric_value as value,
+        training_avg as average,
+        {# when there is an anomaly we would want to use the last value of the metric (lag), otherwise visually the expectations would look out of bounds #}
+        case
+            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' = 'spike' then
+                lagInFrame(metric_value, 1) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
+            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' != 'spike' then
+                lagInFrame(min_metric_value, 1) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
+            when '{{ test_configuration.anomaly_direction }}' = 'spike' then metric_value
+            else min_metric_value
+        end as min_value,
+        case
+            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' = 'drop' then
+                lagInFrame(metric_value, 1) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
+            when is_anomalous = TRUE and '{{ test_configuration.anomaly_direction }}' != 'drop' then
+                lagInFrame(max_metric_value, 1) over (partition by full_table_name, column_name, metric_name, dimension, dimension_value, bucket_seasonality order by bucket_end)
+            when '{{ test_configuration.anomaly_direction }}' = 'drop' then metric_value
+            else max_metric_value
+        end as max_value,
+        bucket_start as start_time,
+        bucket_end as end_time,
+        *
+    from anomaly_scores_with_is_anomalous
+    order by bucket_end, dimension_value
+)
+{% endmacro %}
 
 {%- macro set_directional_anomaly(anomaly_direction, anomaly_score, sensitivity) -%}
 {% if anomaly_direction | lower == 'spike' %}
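
The two implementations differ only in the window function: ClickHouse does not support the standard `lag()`, offering `lagInFrame(expr, offset)` instead, which reads the offset row within the window frame (equivalent to `lag` as long as the frame covers the preceding row). A minimal side-by-side, using a hypothetical metrics table:

-- Standard SQL (default adapters):
select metric_value,
       lag(metric_value) over (order by bucket_end) as previous_value
from metrics;

-- ClickHouse equivalent; note the explicit offset argument:
select metric_value,
       lagInFrame(metric_value, 1) over (order by bucket_end) as previous_value
from metrics;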

macros/utils/cross_db_utils/generate_surrogate_key.sql

Lines changed: 22 additions & 2 deletions
@@ -14,8 +14,11 @@
    limitations under the License.
 #}
 
-
 {%- macro generate_surrogate_key(fields) -%}
+    {{ return(adapter.dispatch('generate_surrogate_key', 'elementary')(fields)) }}
+{%- endmacro -%}
+
+{%- macro default__generate_surrogate_key(fields) -%}
     {% set concat_macro = dbt.concat or dbt_utils.concat %}
     {% set hash_macro = dbt.hash or dbt_utils.hash %}
 
@@ -30,4 +33,21 @@
     {%- endif -%}
     {%- endfor -%}
     {{ hash_macro(concat_macro(field_sqls)) }}
-{% endmacro %}
+{%- endmacro -%}
+
+{%- macro clickhouse__generate_surrogate_key(fields) -%}
+    {% set concat_macro = dbt.concat or dbt_utils.concat %}
+    {% set hash_macro = dbt.hash or dbt_utils.hash %}
+
+    {% set default_null_value = "" %}
+    {%- set field_sqls = [] -%}
+    {%- for field in fields -%}
+        {%- do field_sqls.append(
+            "coalesce(cast(" ~ field ~ " as Nullable(" ~ elementary.edr_type_string() ~ ")), '" ~ default_null_value ~ "')"
+        ) -%}
+        {%- if not loop.last %}
+            {%- do field_sqls.append("'-'") -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{ hash_macro(concat_macro(field_sqls)) }}
+{%- endmacro -%}
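
The ClickHouse variant casts each field to Nullable(String) before coalescing, since casting a NULL-bearing expression to plain String would fail on ClickHouse. For intuition, with fields = ['full_table_name', 'metric_name'] the macro would render roughly the following — the exact spelling depends on how the adapter implements dbt.concat and dbt.hash, so treat this as an approximation:

-- Approximate rendered SQL on ClickHouse (assuming md5/concat rendering):
md5(
    concat(
        coalesce(cast(full_table_name as Nullable(String)), ''),
        '-',
        coalesce(cast(metric_name as Nullable(String)), '')
    )
)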
