|
17 | 17 |
|
18 | 18 | {%- if test_configuration.seasonality == 'day_of_week' %} |
19 | 19 | {%- set bucket_seasonality_expr = elementary.edr_day_of_week_expression('bucket_end') %} |
| 20 | + {%- set has_seasonality = true %} |
20 | 21 |
|
21 | 22 | {%- elif test_configuration.seasonality == 'hour_of_day' %} |
22 | 23 | {%- set bucket_seasonality_expr = elementary.edr_hour_of_day_expression('bucket_end') %} |
| 24 | + {%- set has_seasonality = true %} |
23 | 25 |
|
24 | 26 | {%- elif test_configuration.seasonality == 'hour_of_week' %} |
25 | 27 | {%- set bucket_seasonality_expr = elementary.edr_hour_of_week_expression('bucket_end') %} |
| 28 | + {%- set has_seasonality = true %} |
26 | 29 |
|
27 | 30 | {%- else %} |
28 | 31 | {%- set bucket_seasonality_expr = elementary.const_as_text('no_seasonality') %} |
| 32 | + {%- set has_seasonality = false %} |
29 | 33 | {%- endif %} |
| 34 | + |
| 35 | + {# Build PARTITION BY clause for window functions dynamically to work around Redshift limitation. |
| 36 | + |
| 37 | + Redshift doesn't allow constant expressions in PARTITION BY of window functions. When seasonality |
| 38 | + is not configured, bucket_seasonality becomes a constant ('no_seasonality'::text), which triggers |
| 39 | + the error "constant expressions are not supported in partition by clauses." |
| 40 | + |
| 41 | + We build the partition keys dynamically, always including the core metric keys and only appending |
| 42 | + bucket_seasonality when it's computed from timestamps (has_seasonality = true). Partitioning by |
| 43 | + a constant has no effect anyway, so this preserves behavior while keeping Redshift happy. #} |
| 44 | + {%- set partition_by_keys = "metric_name, full_table_name, column_name, dimension, dimension_value" %} |
| 45 | + {%- if has_seasonality %} |
| 46 | + {%- set partition_by_keys = partition_by_keys ~ ", bucket_seasonality" %} |
| 47 | + {%- endif %} |
| 48 | + |
30 | 49 | {%- set detection_end = elementary.get_detection_end(test_configuration.detection_delay) %} |
31 | 50 | {%- set detection_end_expr = elementary.edr_cast_as_timestamp(elementary.edr_datetime_to_sql(detection_end)) %} |
32 | 51 | {%- set min_bucket_start_expr = elementary.get_trunc_min_bucket_start_expr(detection_end, metric_properties, test_configuration.days_back) %} |
33 | 52 |
|
| 53 | + {# Calculate detection period start for exclusion logic. |
| 54 | + backfill_days defines the window of recent data to test for anomalies on each run. |
| 55 | + It defaults to 2 days (configurable via vars.backfill_days or test-level parameter). |
| 56 | + The detection period spans from (detection_end - backfill_days) to detection_end. |
| 57 | + When exclude_detection_period_from_training is enabled, metrics whose bucket_end is later than |
| 58 | + the detection period start are excluded from training statistics to prevent contamination from potentially anomalous data. #} |
| 59 | + {%- if test_configuration.exclude_detection_period_from_training %} |
| 60 | + {%- set detection_period_start = (detection_end - modules.datetime.timedelta(days=test_configuration.backfill_days)) %} |
| 61 | + {%- set detection_period_start_expr = elementary.edr_cast_as_timestamp(elementary.edr_datetime_to_sql(detection_period_start)) %} |
| 62 | + {%- endif %} |
| 63 | + |
34 | 64 | {# For timestamped tests, this will be the bucket start, and for non-timestamped tests it will be the |
35 | 65 | bucket end (which is the actual time of the test) #} |
36 | 66 | {%- set metric_time_bucket_expr = 'case when bucket_start is not null then bucket_start else bucket_end end' %} |
|
142 | 172 | bucket_end, |
143 | 173 | {{ bucket_seasonality_expr }} as bucket_seasonality, |
144 | 174 | {{ test_configuration.anomaly_exclude_metrics or 'FALSE' }} as is_excluded, |
| 175 | + {# Flag metrics whose bucket_end is after the detection period start for exclusion from training (always FALSE when the option is disabled) #} |
| 176 | + {% if test_configuration.exclude_detection_period_from_training %} |
| 177 | + bucket_end > {{ detection_period_start_expr }} |
| 178 | + {% else %} |
| 179 | + FALSE |
| 180 | + {% endif %} as should_exclude_from_training, |
145 | 181 | bucket_duration_hours, |
146 | 182 | updated_at |
147 | 183 | from grouped_metrics_duplicates |
|
164 | 200 | bucket_seasonality, |
165 | 201 | bucket_duration_hours, |
166 | 202 | updated_at, |
167 | | - avg(metric_value) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_avg, |
168 | | - {{ elementary.standard_deviation('metric_value') }} over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_stddev, |
169 | | - count(metric_value) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_set_size, |
170 | | - last_value(bucket_end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) training_end, |
171 | | - first_value(bucket_end) over (partition by metric_name, full_table_name, column_name, dimension, dimension_value, bucket_seasonality order by bucket_end asc rows between unbounded preceding and current row) as training_start |
| 203 | + should_exclude_from_training, |
| 204 | + avg(case when not should_exclude_from_training then metric_value end) over (partition by {{ partition_by_keys }} order by bucket_end asc rows between unbounded preceding and current row) as training_avg, |
| 205 | + {{ elementary.standard_deviation('case when not should_exclude_from_training then metric_value end') }} over (partition by {{ partition_by_keys }} order by bucket_end asc rows between unbounded preceding and current row) as training_stddev, |
| 206 | + count(case when not should_exclude_from_training then metric_value end) over (partition by {{ partition_by_keys }} order by bucket_end asc rows between unbounded preceding and current row) as training_set_size, |
| 207 | + last_value(case when not should_exclude_from_training then bucket_end end) over (partition by {{ partition_by_keys }} order by bucket_end asc rows between unbounded preceding and current row) training_end, |
| 208 | + first_value(case when not should_exclude_from_training then bucket_end end) over (partition by {{ partition_by_keys }} order by bucket_end asc rows between unbounded preceding and current row) as training_start |
172 | 209 | from grouped_metrics |
173 | 210 | where not is_excluded |
174 | | - {{ dbt_utils.group_by(13) }} |
| 211 | + {{ dbt_utils.group_by(14) }} |
175 | 212 | ), |
176 | 213 |
|
177 | 214 | anomaly_scores as ( |
|
0 commit comments