diff --git a/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql b/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql
deleted file mode 100644
index 4ad5922d4..000000000
--- a/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql
+++ /dev/null
@@ -1,14 +0,0 @@
-{% macro get_recommended_tests(where_expression) %}
-    {% if not where_expression %}
-        {% do exceptions.raise_compiler_error("A 'where_expression' argument is required.") %}
-    {% endif %}
-
-    {% set query %}
-        select resource_name, source_name, test_namespace, test_name, test_args, table_args
-        from {{ ref("test_recommendations") }}
-        where {{ where_expression }}
-    {% endset %}
-
-    {% set result = elementary.run_query(query) %}
-    {% do return(elementary.agate_to_dicts(result)) %}
-{% endmacro %}
diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql
deleted file mode 100644
index 5d9a3647e..000000000
--- a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql
+++ /dev/null
@@ -1,139 +0,0 @@
-{# Prioritization: 1. insertion time, 2. update time. #}
-{% set timestamp_column_names = [
-    "created_at",
-    "created_at_utc",
-    "inserted_at",
-    "inserted_at_utc",
-    "create_date",
-    "created",
-    "db_insert_time",
-    "create_ts",
-    "created_ts",
-    "load_ts",
-    "loaded_at",
-    "date_created",
-    "_etl_loaded_at",
-    "__etl_loaded_at",
-    "_etl_inserted_at",
-    "_ingestion_time",
-    "_fivetran_synced",
-    "_airbyte_emitted_at",
-
-    "updated_at",
-    "updated_at_utc",
-    "update_ts",
-    "updated_ts",
-    "dbt_updated_at",
-    "update_datetime",
-    "event_updated_at",
-    "last_modified_datetime",
-] %}
-
-{% set joined_timestamp_column_names = "'{}'".format(
-    "', '".join(timestamp_column_names)
-) %}
-
-
-with
-    columns as (
-        select distinct
-            lower(database_name) as database_name,
-            lower(schema_name) as schema_name,
-            lower(table_name) as table_name,
-            lower(column_name) as column_name
-        from {{ elementary.get_elementary_relation('information_schema_columns') }}
-    ),
-
-    -- Inferring the timestamp column based on their names and assigning a confidence score.
-    inferred_timestamp_columns as (
-        select
-            database_name,
-            schema_name,
-            table_name,
-            column_name,
-            timestamp_column_names.confidence
-        from columns
-        join
-            (
-                values
-                    {% for timestamp_column_name in timestamp_column_names %}
-                        ('{{ timestamp_column_name }}', {{ loop.index }})
-                        {% if not loop.last %},{% endif %}
-                    {% endfor %}
-            ) as timestamp_column_names(column_name, confidence) using (column_name)
-    ),

-    -- Users can provide the timestamp columns for their sources,
-    -- if provided, we assign a confidence score of 0 (certain).
-    source_provided_timestamp_columns as (
-        select
-            lower(database_name) as database_name,
-            lower(schema_name) as schema_name,
-            lower(name) as table_name,
-            lower(loaded_at_field) as column_name
-        from {{ ref("elementary", "dbt_sources") }}
-        where loaded_at_field is not null
-    ),
-
-    -- Users can provide the timestamp columns for their models,
-    -- if provided, we assign a confidence score of 0 (certain).
-    model_provided_timestamp_columns as (
-        select
-            lower(database_name) as database_name,
-            lower(schema_name) as schema_name,
-            lower(name) as table_name,
-            bigquery_partition_by::json ->> 'field' as column_name
-        from {{ ref("elementary", "dbt_models") }}
-        where bigquery_partition_by::json ->> 'data_type' != 'int64'
-    ),
-
-    -- Combining the inferred and source provided timestamp columns.
-    absolute_rated_timestamp_columns as (
-        select
-            database_name,
-            schema_name,
-            table_name,
-            column_name,
-            inferred.confidence as absolute_confidence
-        from inferred_timestamp_columns inferred
-        union all
-        select
-            database_name,
-            schema_name,
-            table_name,
-            column_name,
-            0 as absolute_confidence
-        from source_provided_timestamp_columns
-        union all
-        select
-            database_name,
-            schema_name,
-            table_name,
-            column_name,
-            0 as absolute_confidence
-        from model_provided_timestamp_columns
-    ),
-
-    -- Sort the timestamp columns by confidence and assign a rank.
-    relative_rated_timestamp_columns as (
-        select
-            database_name,
-            schema_name,
-            table_name,
-            column_name,
-            row_number() over (
-                partition by database_name, schema_name, table_name
-                order by absolute_confidence
-            ) as relative_confidence
-        from absolute_rated_timestamp_columns
-    ),
-
-    -- Select the timestamp columns with the highest confidence.
-    best_rated_timestamp_columns as (
-        select database_name, schema_name, table_name, column_name
-        from relative_rated_timestamp_columns
-        where relative_confidence = 1
-    )
-
-select database_name, schema_name, table_name, column_name as timestamp_column
-from best_rated_timestamp_columns
diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/tables_criticality.sql b/elementary/monitor/dbt_project/models/tests_recommendation/tables_criticality.sql
deleted file mode 100644
index 96d73e351..000000000
--- a/elementary/monitor/dbt_project/models/tests_recommendation/tables_criticality.sql
+++ /dev/null
@@ -1,73 +0,0 @@
-{% set exposures_relation = ref('elementary_cli', 'enriched_exposures') %}
-{% if not exposures_relation %}
-    {% set exposures_relation = ref("elementary", "dbt_exposures") %}
-{% endif %}
-
-with
-    dbt_models_data as (
-        select
-            unique_id as id,
-            database_name,
-            schema_name,
-            alias as table_name,
-            name as resource_name,
-            null as source_name,
-            'model' as table_type,
-            cast(tags as jsonb) as tags,
-            cast(owner as jsonb) as owner,
-            cast(depends_on_nodes as jsonb) as depends_on
-        from {{ ref("elementary", "dbt_models") }}
-    ),
-
-    dbt_sources_data as (
-        select
-            unique_id as id,
-            database_name,
-            schema_name,
-            name as table_name,
-            name as resource_name,
-            source_name,
-            'source' as table_type,
-            cast(tags as jsonb) as tags,
-            cast(owner as jsonb) as owner,
-            cast('[]' as jsonb) as depends_on
-        from {{ ref("elementary", "dbt_sources") }}
-    ),
-
-    tables_information as (
-        select *
-        from dbt_models_data
-        union all
-        select *
-        from dbt_sources_data
-    ),
-
-    dependant_on_counts as (
-        select t1.id, count(*) as dependant_on_count
-        from tables_information t1
-        join tables_information t2 on t2.depends_on ? t1.id
-        group by t1.id
-    ),
-
-    exposure_counts as (
-        select t.id, count(*) as exposure_count
-        from tables_information t
-        join
-            {{ exposures_relation }} e
-            on e.depends_on_nodes::jsonb ? t.id
-        group by t.id
-    ),
-
-    tables as (
-        select
-            tables_information.*,
-            jsonb_array_length(tables_information.depends_on) as depends_on_count,
-            coalesce(dependant_on_counts.dependant_on_count, 0) as dependant_on_count,
-            coalesce(exposure_counts.exposure_count, 0) as exposure_count
-        from tables_information
-        left join dependant_on_counts on tables_information.id = dependant_on_counts.id
-        left join exposure_counts on tables_information.id = exposure_counts.id
-    )
-
-select *
-from tables
diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql b/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql
deleted file mode 100644
index 3007fd991..000000000
--- a/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql
+++ /dev/null
@@ -1,105 +0,0 @@
-{# Object structure is [test_namespace, test_name] #}
-{% set recommended_tests = [
-    ("elementary", "volume_anomalies"),
-    ("elementary", "freshness_anomalies"),
-    ("elementary", "schema_changes_from_baseline"),
-] %}
-
-with
-    tables_criticality as (
-        select
-            id,
-            lower(database_name) as database_name,
-            lower(schema_name) as schema_name,
-            lower(table_name) as table_name,
-            resource_name,
-            source_name,
-            tags,
-            owner,
-            depends_on_count,
-            dependant_on_count,
-            exposure_count,
-            table_type
-        from {{ ref("tables_criticality") }}
-    ),
-
-    potential_recommended_tests as (
-        select id, test_namespace, short_name
-        from tables_criticality
-        cross join
-            (
-                {% for recommended_test in recommended_tests %}
-                    select
-                        '{{ recommended_test[0] }}' as test_namespace,
-                        '{{ recommended_test[1] }}' as short_name
-                    {% if not loop.last %}
-                        union all
-                    {% endif %}
-                {% endfor %}
-            ) rt
-    ),
-
-    existing_recommended_tests as (
-        select parent_model_unique_id, test_namespace, short_name
-        from {{ ref("elementary", "dbt_tests") }}
-    ),
-
-    pending_recommended_tests as (
-        select id, test_namespace, short_name
-        from potential_recommended_tests
-        where
-            (id, test_namespace, short_name) not in (
-                select parent_model_unique_id, test_namespace, short_name
-                from existing_recommended_tests
-            )
-    ),
-
-    timestamp_columns as (
-        select database_name, schema_name, table_name, timestamp_column
-        from {{ ref("table_timestamp_columns") }}
-    ),
-
-    table_columns as (
-        select
-            lower(database_name) as database_name,
-            lower(schema_name) as schema_name,
-            lower(table_name) as table_name,
-            json_agg(json_build_object('name', lower(column_name), 'data_type', lower(data_type))) as columns
-        from {{ elementary.get_elementary_relation('information_schema_columns') }}
-        group by 1, 2, 3
-    ),
-
-    pending_tests_with_table_info as (
-        select
-            resource_name,
-            source_name,
-            test_namespace,
-            short_name as test_name,
-            tags,
-            owner,
-            depends_on_count,
-            dependant_on_count,
-            exposure_count,
-            table_type,
-            case
-                when short_name in ('volume_anomalies', 'freshness_anomalies') and timestamp_column is not null
-                then jsonb_build_object('timestamp_column', timestamp_column)
-            end as test_args,
-            case
-                when short_name = 'schema_changes_from_baseline'
-                then jsonb_build_object('columns', table_columns.columns)
-            end as table_args
-        from pending_recommended_tests
-        join tables_criticality using (id)
-        left join timestamp_columns using (database_name, schema_name, table_name)
-        left join table_columns using (database_name, schema_name, table_name)
-        where
-            short_name = 'volume_anomalies'
-            or
-            (short_name = 'freshness_anomalies' and timestamp_column is not null)
-            or
-            (short_name = 'schema_changes_from_baseline' and table_columns.columns is not null)
-    )
-
-select *
-from pending_tests_with_table_info