feat(alarm): add minSampleCountToEvaluateDatapoint (#453)

miloszwatroba · web-flow · commit 44fbbbb67cb5 · 2023-11-10T16:22:41.000-05:00
Fixes #452 Currently, when using `minMetricSamplesToAlarm`, the number of samples is evaluated over a different period than the main alarm. This makes monitoring sensitive to false positives, as not every breaching datapoint is guaranteed to have a sufficient number of samples (see #452 for more details). Moreover, the current approach for adjusting alarms to respect `minMetricSamplesToAlarm` is to create 2 extra alarms - one for `NoSamples` and one for a top-level composite. Each of these monitors incurs extra costs ($0.10 for the `NoSamples` monitor and $0.50 for the Composite, see https://aws.amazon.com/cloudwatch/pricing/ for reference). This means that using `minMetricSamplesToAlarm` increases the cost from $0.10 per alarm to $0.70 per alarm ($0.60 of overhead!). It's possible to use a Math Expression instead. Instead of adding a separate alarm for `NoSamples`, we can model it as a Sample Count metric, and instead of the Composite, we can use a MathExpression that conditionally emits the data based on the number of samples. The charge for Math Expression-based alarms is per metric in the Math Expression, so that comes down to $0.20 per alarm. That's roughly a 70% cost reduction. Additionally, it reduces the overall number of alarms, making it easier to fit your alarming within the CloudWatch quota and decluttering the UI. To avoid breaking any customers that rely on `minMetricSamplesToAlarm` generating alarms (e.g. #403), this change deprecates it and adds `minSampleCountToEvaluateDatapoint` with the updated behaviour next to it. --- _By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license_
diff --git a/API.md b/API.md
diff --git a/lib/common/alarm/AlarmFactory.ts b/lib/common/alarm/AlarmFactory.ts
@@ -7,6 +7,7 @@ import {
   CompositeAlarm,
   HorizontalAnnotation,
   IAlarmRule,
+  MathExpression,
   TreatMissingData,
 } from "aws-cdk-lib/aws-cloudwatch";
 import { Construct } from "constructs";
@@ -186,6 +187,18 @@ export interface AddAlarmProps {
    */
   readonly evaluateLowSampleCountPercentile?: boolean;
 
+  /**
+   * Specifies how many samples (N) of the metric are needed in a datapoint for it to be evaluated for alarming.
+   * If this property is specified, your metric will be wrapped in a MathExpression that adds an IF condition
+   * (sampleCount > N) to make sure that each datapoint is evaluated only if it has a sufficient number of samples.
+   * If the number of samples is not sufficient, the datapoint will be treated as missing data and will be evaluated
+   * according to the treatMissingData parameter.
+   * If specified, deprecated minMetricSamplesToAlarm has no effect.
+   *
+   * @default - default behaviour - no condition on sample count will be used
+   */
+  readonly minSampleCountToEvaluateDatapoint?: number;
+
   /**
    * Specifies how many samples (N) of the metric is needed to trigger the alarm.
    * If this property is specified, an artificial composite alarm is created of the following:
@@ -195,6 +208,9 @@ export interface AddAlarmProps {
    * </ul>
    * The newly created composite alarm will be returned as a result, and it will take the original alarm actions.
    * @default - default behaviour - no condition on sample count will be added to the alarm
+   * @deprecated Use minSampleCountToEvaluateDatapoint instead. minMetricSamplesToAlarm uses a different evaluation
+   *   period for its child alarms, so it doesn't guarantee that each datapoint in the evaluation period has
+   *   a sufficient number of samples.
    */
   readonly minMetricSamplesToAlarm?: number;
 
@@ -511,6 +527,9 @@ export class AlarmFactory {
       props
     );
 
+    // metric that will be ultimately used to create the alarm
+    let alarmMetric: MetricWithAlarmSupport = adjustedMetric;
+
     // prepare primary alarm properties
 
     const actionsEnabled = this.determineActionsEnabled(
@@ -549,32 +568,58 @@ export class AlarmFactory {
       );
     }
 
+    // apply metric math for minimum metric samples
+
+    if (props.minSampleCountToEvaluateDatapoint) {
+      if (adjustedMetric instanceof MathExpression) {
+        throw new Error(
+          "minSampleCountToEvaluateDatapoint is not supported for MathExpressions. " +
+            "If you already use MathExpression, you can extend your expression to evaluate " +
+            "the sample count using IF statement, e.g. IF(sampleCount > X, mathExpression)."
+        );
+      }
+
+      const metricSampleCount = adjustedMetric.with({
+        statistic: MetricStatistic.N,
+        label: "Sample count",
+      });
+
+      alarmMetric = new MathExpression({
+        label: `${adjustedMetric}`,
+        expression: `IF(sampleCount > ${props.minSampleCountToEvaluateDatapoint}, metric)`,
+        usingMetrics: {
+          metric: adjustedMetric,
+          sampleCount: metricSampleCount,
+        },
+      });
+    }
+
     // create primary alarm
 
-    const primaryAlarm = adjustedMetric.createAlarm(
-      this.alarmScope,
+    const primaryAlarm = alarmMetric.createAlarm(this.alarmScope, alarmName, {
       alarmName,
-      {
-        alarmName,
-        alarmDescription,
-        threshold: props.threshold,
-        comparisonOperator: props.comparisonOperator,
-        treatMissingData: props.treatMissingData,
-        // default value (undefined) means "evaluate"
-        evaluateLowSampleCountPercentile: evaluateLowSampleCountPercentile
-          ? undefined
-          : "ignore",
-        datapointsToAlarm,
-        evaluationPeriods,
-        actionsEnabled,
-      }
-    );
+      alarmDescription,
+      threshold: props.threshold,
+      comparisonOperator: props.comparisonOperator,
+      treatMissingData: props.treatMissingData,
+      // default value (undefined) means "evaluate"
+      evaluateLowSampleCountPercentile: evaluateLowSampleCountPercentile
+        ? undefined
+        : "ignore",
+      datapointsToAlarm,
+      evaluationPeriods,
+      actionsEnabled,
+    });
 
     let alarm: AlarmBase = primaryAlarm;
 
     // create composite alarm for min metric samples (if defined)
+    // deprecated in favour of minSampleCountToEvaluateDatapoint
 
-    if (props.minMetricSamplesToAlarm) {
+    if (
+      !props.minSampleCountToEvaluateDatapoint &&
+      props.minMetricSamplesToAlarm
+    ) {
       const metricSampleCount = adjustedMetric.with({
         statistic: MetricStatistic.N,
       });
@@ -627,6 +672,8 @@ export class AlarmFactory {
       datapointsToAlarm,
       dedupeString,
       minMetricSamplesToAlarm: props.minMetricSamplesToAlarm,
+      minSampleCountToEvaluateDatapoint:
+        props.minSampleCountToEvaluateDatapoint,
       fillAlarmRange: props.fillAlarmRange ?? false,
       overrideAnnotationColor: props.overrideAnnotationColor,
       overrideAnnotationLabel: props.overrideAnnotationLabel,
diff --git a/lib/common/alarm/IAlarmAnnotationStrategy.ts b/lib/common/alarm/IAlarmAnnotationStrategy.ts
@@ -13,6 +13,7 @@ export interface AlarmAnnotationStrategyProps extends AlarmMetadata {
   readonly metric: MetricWithAlarmSupport;
   readonly comparisonOperator: ComparisonOperator;
   readonly minMetricSamplesToAlarm?: number;
+  readonly minSampleCountToEvaluateDatapoint?: number;
   readonly threshold: number;
   readonly datapointsToAlarm: number;
   readonly evaluationPeriods: number;
diff --git a/test/common/alarm/AlarmFactory.test.ts b/test/common/alarm/AlarmFactory.test.ts
@@ -1,9 +1,10 @@
 import { Duration, Stack } from "aws-cdk-lib";
-import { Capture, Template } from "aws-cdk-lib/assertions";
+import { Capture, Match, Template } from "aws-cdk-lib/assertions";
 import {
   Alarm,
   CfnAlarm,
   ComparisonOperator,
+  MathExpression,
   Metric,
   Shading,
   TreatMissingData,
@@ -330,6 +331,84 @@ test("addAlarm: check created alarms when minMetricSamplesToAlarm is used", () =
   });
 });
 
+test("addAlarm: check created alarms when minSampleCountToEvaluateDatapoint is used", () => {
+  const stack = new Stack();
+  const factory = new AlarmFactory(stack, {
+    globalMetricDefaults,
+    globalAlarmDefaults,
+    localAlarmNamePrefix: "prefix",
+  });
+  factory.addAlarm(metric, {
+    ...props,
+    alarmNameSuffix: "none",
+    comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD,
+    minSampleCountToEvaluateDatapoint: 42,
+    minMetricSamplesToAlarm: 55, // not used if minSampleCountToEvaluateDatapoint defined
+  });
+
+  const template = Template.fromStack(stack);
+  template.hasResourceProperties("AWS::CloudWatch::Alarm", {
+    AlarmName: "DummyServiceAlarms-prefix-none",
+    AlarmDescription: "Description",
+    ComparisonOperator: "LessThanThreshold",
+    DatapointsToAlarm: 10,
+    EvaluationPeriods: 10,
+    TreatMissingData: "notBreaching",
+    Metrics: [
+      Match.objectLike({
+        Expression: "IF(sampleCount > 42, metric)",
+        Label: "DummyMetric1",
+      }),
+      {
+        Id: "metric",
+        MetricStat: {
+          Metric: Match.objectLike({
+            MetricName: "DummyMetric1",
+          }),
+          Period: 300,
+          Stat: "Average",
+        },
+        ReturnData: false,
+      },
+      {
+        Id: "sampleCount",
+        MetricStat: {
+          Metric: Match.objectLike({
+            MetricName: "DummyMetric1",
+          }),
+          Period: 300,
+          Stat: "SampleCount",
+        },
+        ReturnData: false,
+      },
+    ],
+  });
+});
+
+test("addAlarm: minSampleCountToEvaluateDatapoint used with Math Expression throws error", () => {
+  const stack = new Stack();
+  const factory = new AlarmFactory(stack, {
+    globalMetricDefaults,
+    globalAlarmDefaults,
+    localAlarmNamePrefix: "prefix",
+  });
+  const mathExpression = new MathExpression({
+    expression: "MAX(metric)",
+    usingMetrics: {
+      metric,
+    },
+  });
+
+  expect(() =>
+    factory.addAlarm(mathExpression, {
+      ...props,
+      minSampleCountToEvaluateDatapoint: 42,
+    })
+  ).toThrow(
+    "minSampleCountToEvaluateDatapoint is not supported for MathExpressions"
+  );
+});
+
 test("addCompositeAlarm: snapshot for operator", () => {
   const stack = new Stack();
   const factory = new AlarmFactory(stack, {