|
31 | 31 | {%- set detection_end_expr = elementary.edr_cast_as_timestamp(elementary.edr_datetime_to_sql(detection_end)) %} |
32 | 32 | {%- set min_bucket_start_expr = elementary.get_trunc_min_bucket_start_expr(detection_end, metric_properties, test_configuration.days_back) %} |
33 | 33 |
|
34 | | - {# Calculate detection period start for exclusion logic #} |
| 34 | + {# Calculate detection period start for exclusion logic. |
| 35 | + The detection period spans from (detection_end - backfill_days) to detection_end. |
| 36 | + Excluding it keeps the most recent backfill_days' worth of metrics out of training, |
| 37 | + since those are the metrics being actively tested for anomalies. #} |
35 | 38 | {%- if test_configuration.exclude_detection_period_from_training %} |
36 | 39 | {%- set detection_period_start = (detection_end - modules.datetime.timedelta(days=test_configuration.backfill_days)) %} |
37 | 40 | {%- set detection_period_start_expr = elementary.edr_cast_as_timestamp(elementary.edr_datetime_to_sql(detection_period_start)) %} |
|
153 | 156 | bucket_end > {{ detection_period_start_expr }} |
154 | 157 | {% else %} |
155 | 158 | FALSE |
156 | | - {% endif %} as is_detection_period, |
| 159 | + {% endif %} as should_exclude_from_training, |
157 | 160 | bucket_duration_hours, |
158 | 161 | updated_at |
159 | 162 | from grouped_metrics_duplicates |
|
176 | 179 | bucket_seasonality, |
177 | 180 | bucket_duration_hours, |
178 | 181 | updated_at, |
179 | | - is_detection_period, |
180 | | - avg(case when not is_detection_period then metric_value end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_avg, |
181 | | - {{ elementary.standard_deviation('case when not is_detection_period then metric_value end') }} over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_stddev, |
182 | | - count(case when not is_detection_period then metric_value end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_set_size, |
183 | | - last_value(case when not is_detection_period then bucket_end end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) training_end, |
184 | | - first_value(case when not is_detection_period then bucket_end end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_start |
| 182 | + should_exclude_from_training, |
| 183 | + avg(case when not should_exclude_from_training then metric_value end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_avg, |
| 184 | + {{ elementary.standard_deviation('case when not should_exclude_from_training then metric_value end') }} over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_stddev, |
| 185 | + count(case when not should_exclude_from_training then metric_value end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_set_size, |
| 186 | + last_value(case when not should_exclude_from_training then bucket_end end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) training_end, |
| 187 | + first_value(case when not should_exclude_from_training then bucket_end end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_start |
185 | 188 | from grouped_metrics |
186 | 189 | where not is_excluded |
187 | 190 | {{ dbt_utils.group_by(14) }} |
|
0 commit comments