Skip to content

Commit 696804e

Browse files
feat(kinesisdataanalytics): Fix Checkpoint Failure Rate calculation and add Full Restart Rate monitoring (#668)
### Changes

**Fixed Checkpoint Failure Rate**
- Replaced the manual rate calculation with CloudWatch's native `RATE()` function.
- Changed from `(3600 * checkpoints) / PERIOD(checkpoints)` to `RATE(numberOfFailedCheckpoints)`.

**Added Full Restart Rate Monitoring**
- New `metricFullRestartRate()` method using `RATE(fullRestarts)`.
- New `addFullRestartRateAlarm()` with a custom implementation to avoid naming conflicts.
- Enhanced the Full Restarts widget to show both count (left axis) and rate (right axis).

**Technical**
- Resolved alarm naming conflicts between the full restart and checkpoint failure rate alarms.
- Added test coverage for the new functionality.
- All 5 alarms are now supported (previously 4).

Provides more accurate rate monitoring and better insights into Kinesis Data Analytics application performance.
1 parent 4254fbf commit 696804e

File tree

7 files changed

+213
-20
lines changed

7 files changed

+213
-20
lines changed

API.md

Lines changed: 76 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/common/monitoring/alarms/KinesisDataAnalyticsAlarmFactory.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,26 @@ export class KinesisDataAnalyticsAlarmFactory {
7070
});
7171
}
7272

73+
addFullRestartRateAlarm(
74+
metric: MetricWithAlarmSupport,
75+
props: ErrorRateThreshold,
76+
disambiguator?: string,
77+
) {
78+
return this.alarmFactory.addAlarm(metric, {
79+
treatMissingData:
80+
props.treatMissingDataOverride ?? TreatMissingData.BREACHING,
81+
comparisonOperator:
82+
props.comparisonOperatorOverride ??
83+
ComparisonOperator.GREATER_THAN_THRESHOLD,
84+
...props,
85+
disambiguator,
86+
threshold: props.maxErrorRate,
87+
alarmNameSuffix: "FullRestartRate",
88+
alarmDescription: "Full restart rate is too high",
89+
alarmDedupeStringSuffix: "KDAFullRestartRateAlarm",
90+
});
91+
}
92+
7393
addCheckpointFailureCountAlarm(
7494
metric: MetricWithAlarmSupport,
7595
props: ErrorCountThreshold,

lib/monitoring/aws-kinesisanalytics/KinesisDataAnalyticsMetricFactory.ts

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import {
55
BaseMetricFactoryProps,
66
MetricFactory,
77
MetricStatistic,
8-
RateComputationMethod,
98
} from "../../common";
109

1110
export interface KinesisDataAnalyticsMetricFactoryProps
@@ -120,11 +119,34 @@ export class KinesisDataAnalyticsMetricFactory extends BaseMetricFactory<Kinesis
120119
}
121120

122121
metricCheckpointFailureRate() {
123-
return this.metricFactory.toRate(
124-
this.metricNumberOfFailedCheckpointsCount(),
125-
RateComputationMethod.PER_HOUR,
126-
false,
127-
"checkpoints",
122+
// Flink reports this metric as the latest sum for the lifecycle of a job.
123+
// Therefore, we truly care about rate of change
124+
return this.metricFactory.createMetricMath(
125+
"RATE(numberOfFailedCheckpoints)",
126+
{
127+
numberOfFailedCheckpoints: this.metricNumberOfFailedCheckpointsCount(),
128+
},
129+
"Checkpoint Failure Rate",
130+
undefined,
131+
undefined,
132+
this.region,
133+
this.account,
134+
);
135+
}
136+
137+
metricFullRestartRate() {
138+
// Flink reports this metric as the latest sum for the lifecycle of a job.
139+
// Therefore, we truly care about rate of change
140+
return this.metricFactory.createMetricMath(
141+
"RATE(fullRestarts)",
142+
{
143+
fullRestarts: this.metricFullRestartsCount(),
144+
},
145+
"Full Restart Rate",
146+
undefined,
147+
undefined,
148+
this.region,
149+
this.account,
128150
);
129151
}
130152

lib/monitoring/aws-kinesisanalytics/KinesisDataAnalyticsMonitoring.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ export interface KinesisDataAnalyticsMonitoringOptions
3838

3939
readonly addFullRestartCountAlarm?: Record<string, FullRestartCountThreshold>;
4040

41+
readonly addFullRestartRateAlarm?: Record<string, ErrorRateThreshold>;
42+
4143
readonly addCheckpointFailureCountAlarm?: Record<string, ErrorCountThreshold>;
4244

4345
readonly addCheckpointFailureRateAlarm?: Record<string, ErrorRateThreshold>;
@@ -54,6 +56,7 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
5456
readonly kdaAlarmFactory: KinesisDataAnalyticsAlarmFactory;
5557
readonly downtimeAnnotations: HorizontalAnnotation[];
5658
readonly fullRestartAnnotations: HorizontalAnnotation[];
59+
readonly fullRestartRateAnnotations: HorizontalAnnotation[];
5760
readonly checkpointFailureCountAnnotations: HorizontalAnnotation[];
5861
readonly checkpointFailureRateAnnotations: HorizontalAnnotation[];
5962

@@ -68,6 +71,7 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
6871
readonly oldGenerationGCCountMetric: MetricWithAlarmSupport;
6972
readonly oldGenerationGCTimeMsMetric: MetricWithAlarmSupport;
7073
readonly checkpointFailureRateMetric: MetricWithAlarmSupport;
74+
readonly fullRestartRateMetric: MetricWithAlarmSupport;
7175

7276
constructor(
7377
scope: MonitoringScope,
@@ -90,6 +94,7 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
9094
this.kdaAlarmFactory = new KinesisDataAnalyticsAlarmFactory(alarmFactory);
9195
this.downtimeAnnotations = [];
9296
this.fullRestartAnnotations = [];
97+
this.fullRestartRateAnnotations = [];
9398
this.checkpointFailureCountAnnotations = [];
9499
this.checkpointFailureRateAnnotations = [];
95100

@@ -117,6 +122,7 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
117122
metricFactory.metricOldGenerationGCTimeMs();
118123
this.checkpointFailureRateMetric =
119124
metricFactory.metricCheckpointFailureRate();
125+
this.fullRestartRateMetric = metricFactory.metricFullRestartRate();
120126

121127
for (const disambiguator in props.addDowntimeAlarm) {
122128
const alarmProps = props.addDowntimeAlarm[disambiguator];
@@ -140,6 +146,17 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
140146
this.addAlarm(createdAlarm);
141147
}
142148

149+
for (const disambiguator in props.addFullRestartRateAlarm) {
150+
const alarmProps = props.addFullRestartRateAlarm[disambiguator];
151+
const createdAlarm = this.kdaAlarmFactory.addFullRestartRateAlarm(
152+
this.fullRestartRateMetric,
153+
alarmProps,
154+
disambiguator,
155+
);
156+
this.fullRestartRateAnnotations.push(createdAlarm.annotation);
157+
this.addAlarm(createdAlarm);
158+
}
159+
143160
for (const disambiguator in props.addCheckpointFailureCountAlarm) {
144161
const alarmProps = props.addCheckpointFailureCountAlarm[disambiguator];
145162
const createdAlarm = this.kdaAlarmFactory.addCheckpointFailureCountAlarm(
@@ -230,6 +247,9 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
230247
left: [this.fullRestartsCountMetric],
231248
leftYAxis: CountAxisFromZero,
232249
leftAnnotations: this.fullRestartAnnotations,
250+
right: [this.fullRestartRateMetric],
251+
rightYAxis: RateAxisFromZero,
252+
rightAnnotations: this.fullRestartRateAnnotations,
233253
});
234254
}
235255

test/facade/__snapshots__/MonitoringAspect.test.ts.snap

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/monitoring/aws-kinesisanalytics/KinesisDataAnalyticsMonitoring.test.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ test("snapshot test: all alarms", () => {
3737
maxFullRestartCount: 1,
3838
},
3939
},
40+
addFullRestartRateAlarm: {
41+
Warning: {
42+
maxErrorRate: 0.1,
43+
},
44+
},
4045
addCheckpointFailureCountAlarm: {
4146
Warning: {
4247
maxErrorCount: 5,
@@ -55,6 +60,6 @@ test("snapshot test: all alarms", () => {
5560
});
5661

5762
addMonitoringDashboardsToStack(stack, monitoring);
58-
expect(numAlarmsCreated).toStrictEqual(4);
63+
expect(numAlarmsCreated).toStrictEqual(5);
5964
expect(Template.fromStack(stack)).toMatchSnapshot();
6065
});

0 commit comments

Comments
 (0)