Skip to content

Commit 63bb22f

Browse files
feat(ecs): add p100 CPU and memory utilization metrics and alarms (#715)
Add a p100 statistic alongside the existing average for the CPU and memory utilization widgets in Fargate and EC2 service monitoring, and add optional alarms on the new p100 metrics (`addCpuP100UsageAlarm`, `addMemoryP100UsageAlarm`), enabling operators to monitor peak resource usage.

Fixes #714

---

_By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license_

Co-authored-by: Colton White <coltonawhite@gmail.com>
1 parent 2d07670 commit 63bb22f

File tree

10 files changed

+4908
-456
lines changed

10 files changed

+4908
-456
lines changed

API.md

Lines changed: 322 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/monitoring/aws-ecs-patterns/BaseServiceMetricFactory.ts

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -41,11 +41,13 @@ export class BaseServiceMetricFactory extends BaseMetricFactory<BaseServiceMetri
4141
this.service = props.service;
4242
}
4343

44-
metricClusterCpuUtilisationInPercent() {
44+
metricClusterCpuUtilisationInPercent(
45+
statistic: MetricStatistic = MetricStatistic.AVERAGE,
46+
) {
4547
return this.metricFactory.createMetric(
4648
"CPUUtilization",
47-
MetricStatistic.AVERAGE,
48-
"Cluster CPU Utilization",
49+
statistic,
50+
`Cluster CPU Utilization ${statistic}`,
4951
this.dimensionsMap,
5052
undefined,
5153
EcsNamespace,
@@ -55,11 +57,13 @@ export class BaseServiceMetricFactory extends BaseMetricFactory<BaseServiceMetri
5557
);
5658
}
5759

58-
metricClusterMemoryUtilisationInPercent() {
60+
metricClusterMemoryUtilisationInPercent(
61+
statistic: MetricStatistic = MetricStatistic.AVERAGE,
62+
) {
5963
return this.metricFactory.createMetric(
6064
"MemoryUtilization",
61-
MetricStatistic.AVERAGE,
62-
"Cluster Memory Utilization",
65+
statistic,
66+
`Cluster Memory Utilization ${statistic}`,
6367
this.dimensionsMap,
6468
undefined,
6569
EcsNamespace,

lib/monitoring/aws-ecs-patterns/Ec2ServiceMonitoring.ts

Lines changed: 38 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ import {
2525
HealthyTaskCountThreshold,
2626
HealthyTaskPercentThreshold,
2727
MetricFactory,
28+
MetricStatistic,
2829
MetricWithAlarmSupport,
2930
MinProcessedBytesThreshold,
3031
Monitoring,
@@ -40,6 +41,7 @@ import {
4041
UnhealthyTaskCountThreshold,
4142
UsageAlarmFactory,
4243
UsageThreshold,
44+
UsageType,
4345
} from "../../common";
4446
import {
4547
MonitoringHeaderWidget,
@@ -63,6 +65,8 @@ export interface BaseEc2ServiceAlarms {
6365
readonly maxAutoScalingTaskCount?: number;
6466
readonly addCpuUsageAlarm?: Record<string, UsageThreshold>;
6567
readonly addMemoryUsageAlarm?: Record<string, UsageThreshold>;
68+
readonly addCpuP100UsageAlarm?: Record<string, UsageThreshold>;
69+
readonly addMemoryP100UsageAlarm?: Record<string, UsageThreshold>;
6670

6771
/**
6872
* Container Insights needs to be enabled for the cluster for this alarm.
@@ -176,7 +180,9 @@ export class Ec2ServiceMonitoring extends Monitoring {
176180
readonly unhealthyTaskCountMetric?: MetricWithAlarmSupport;
177181
readonly healthyTaskPercentMetric?: MetricWithAlarmSupport;
178182
readonly cpuUtilisationMetric: MetricWithAlarmSupport;
183+
readonly cpuP100UtilisationMetric: MetricWithAlarmSupport;
179184
readonly memoryUtilisationMetric: MetricWithAlarmSupport;
185+
readonly memoryP100UtilisationMetric: MetricWithAlarmSupport;
180186
readonly runningTaskCountMetric: MetricWithAlarmSupport;
181187
readonly ephemeralStorageUsageMetric: MetricWithAlarmSupport;
182188
readonly activeTcpFlowCountMetric?: MetricWithAlarmSupport;
@@ -227,8 +233,16 @@ export class Ec2ServiceMonitoring extends Monitoring {
227233
}
228234
this.cpuUtilisationMetric =
229235
this.baseServiceMetricFactory.metricClusterCpuUtilisationInPercent();
236+
this.cpuP100UtilisationMetric =
237+
this.baseServiceMetricFactory.metricClusterCpuUtilisationInPercent(
238+
MetricStatistic.P100,
239+
);
230240
this.memoryUtilisationMetric =
231241
this.baseServiceMetricFactory.metricClusterMemoryUtilisationInPercent();
242+
this.memoryP100UtilisationMetric =
243+
this.baseServiceMetricFactory.metricClusterMemoryUtilisationInPercent(
244+
MetricStatistic.P100,
245+
);
232246
this.runningTaskCountMetric =
233247
this.baseServiceMetricFactory.metricRunningTaskCount();
234248
this.ephemeralStorageUsageMetric =
@@ -309,6 +323,17 @@ export class Ec2ServiceMonitoring extends Monitoring {
309323
this.cpuUsageAnnotations.push(createdAlarm.annotation);
310324
this.addAlarm(createdAlarm);
311325
}
326+
for (const disambiguator in props.addCpuP100UsageAlarm) {
327+
const alarmProps = props.addCpuP100UsageAlarm[disambiguator];
328+
const createdAlarm = this.usageAlarmFactory.addMaxCpuUsagePercentAlarm(
329+
this.cpuP100UtilisationMetric,
330+
alarmProps,
331+
disambiguator,
332+
UsageType.P100,
333+
);
334+
this.cpuUsageAnnotations.push(createdAlarm.annotation);
335+
this.addAlarm(createdAlarm);
336+
}
312337
for (const disambiguator in props.addMemoryUsageAlarm) {
313338
const alarmProps = props.addMemoryUsageAlarm[disambiguator];
314339
const createdAlarm = this.usageAlarmFactory.addMaxMemoryUsagePercentAlarm(
@@ -319,6 +344,17 @@ export class Ec2ServiceMonitoring extends Monitoring {
319344
this.memoryUsageAnnotations.push(createdAlarm.annotation);
320345
this.addAlarm(createdAlarm);
321346
}
347+
for (const disambiguator in props.addMemoryP100UsageAlarm) {
348+
const alarmProps = props.addMemoryP100UsageAlarm[disambiguator];
349+
const createdAlarm = this.usageAlarmFactory.addMemoryUsagePercentAlarm(
350+
this.memoryP100UtilisationMetric,
351+
alarmProps,
352+
UsageType.P100,
353+
disambiguator,
354+
);
355+
this.memoryUsageAnnotations.push(createdAlarm.annotation);
356+
this.addAlarm(createdAlarm);
357+
}
322358

323359
for (const disambiguator in props.addRunningTaskCountAlarm) {
324360
const alarmProps = props.addRunningTaskCountAlarm[disambiguator];
@@ -404,7 +440,7 @@ export class Ec2ServiceMonitoring extends Monitoring {
404440
width,
405441
height,
406442
title: "CPU Utilization",
407-
left: [this.cpuUtilisationMetric],
443+
left: [this.cpuUtilisationMetric, this.cpuP100UtilisationMetric],
408444
leftYAxis: PercentageAxisFromZeroToHundred,
409445
leftAnnotations: this.cpuUsageAnnotations,
410446
});
@@ -415,7 +451,7 @@ export class Ec2ServiceMonitoring extends Monitoring {
415451
width,
416452
height,
417453
title: "Memory Utilization",
418-
left: [this.memoryUtilisationMetric],
454+
left: [this.memoryUtilisationMetric, this.memoryP100UtilisationMetric],
419455
leftYAxis: PercentageAxisFromZeroToHundred,
420456
leftAnnotations: this.memoryUsageAnnotations,
421457
});

lib/monitoring/aws-ecs-patterns/FargateServiceMonitoring.ts

Lines changed: 38 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ import {
2525
HealthyTaskCountThreshold,
2626
HealthyTaskPercentThreshold,
2727
MetricFactory,
28+
MetricStatistic,
2829
MetricWithAlarmSupport,
2930
MinProcessedBytesThreshold,
3031
Monitoring,
@@ -40,6 +41,7 @@ import {
4041
UnhealthyTaskCountThreshold,
4142
UsageAlarmFactory,
4243
UsageThreshold,
44+
UsageType,
4345
} from "../../common";
4446
import {
4547
MonitoringHeaderWidget,
@@ -63,6 +65,8 @@ export interface BaseFargateServiceAlarms {
6365
readonly maxAutoScalingTaskCount?: number;
6466
readonly addCpuUsageAlarm?: Record<string, UsageThreshold>;
6567
readonly addMemoryUsageAlarm?: Record<string, UsageThreshold>;
68+
readonly addCpuP100UsageAlarm?: Record<string, UsageThreshold>;
69+
readonly addMemoryP100UsageAlarm?: Record<string, UsageThreshold>;
6670

6771
/**
6872
* Container Insights needs to be enabled for the cluster for this alarm.
@@ -176,7 +180,9 @@ export class FargateServiceMonitoring extends Monitoring {
176180
readonly unhealthyTaskCountMetric?: MetricWithAlarmSupport;
177181
readonly healthyTaskPercentMetric?: MetricWithAlarmSupport;
178182
readonly cpuUtilisationMetric: MetricWithAlarmSupport;
183+
readonly cpuP100UtilisationMetric: MetricWithAlarmSupport;
179184
readonly memoryUtilisationMetric: MetricWithAlarmSupport;
185+
readonly memoryP100UtilisationMetric: MetricWithAlarmSupport;
180186
readonly runningTaskCountMetric: MetricWithAlarmSupport;
181187
readonly ephemeralStorageUsageMetric: MetricWithAlarmSupport;
182188
readonly activeTcpFlowCountMetric?: MetricWithAlarmSupport;
@@ -230,8 +236,16 @@ export class FargateServiceMonitoring extends Monitoring {
230236
}
231237
this.cpuUtilisationMetric =
232238
this.baseServiceMetricFactory.metricClusterCpuUtilisationInPercent();
239+
this.cpuP100UtilisationMetric =
240+
this.baseServiceMetricFactory.metricClusterCpuUtilisationInPercent(
241+
MetricStatistic.P100,
242+
);
233243
this.memoryUtilisationMetric =
234244
this.baseServiceMetricFactory.metricClusterMemoryUtilisationInPercent();
245+
this.memoryP100UtilisationMetric =
246+
this.baseServiceMetricFactory.metricClusterMemoryUtilisationInPercent(
247+
MetricStatistic.P100,
248+
);
235249
this.runningTaskCountMetric =
236250
this.baseServiceMetricFactory.metricRunningTaskCount();
237251
this.ephemeralStorageUsageMetric =
@@ -313,6 +327,17 @@ export class FargateServiceMonitoring extends Monitoring {
313327
this.cpuUsageAnnotations.push(createdAlarm.annotation);
314328
this.addAlarm(createdAlarm);
315329
}
330+
for (const disambiguator in props.addCpuP100UsageAlarm) {
331+
const alarmProps = props.addCpuP100UsageAlarm[disambiguator];
332+
const createdAlarm = this.usageAlarmFactory.addMaxCpuUsagePercentAlarm(
333+
this.cpuP100UtilisationMetric,
334+
alarmProps,
335+
disambiguator,
336+
UsageType.P100,
337+
);
338+
this.cpuUsageAnnotations.push(createdAlarm.annotation);
339+
this.addAlarm(createdAlarm);
340+
}
316341
for (const disambiguator in props.addMemoryUsageAlarm) {
317342
const alarmProps = props.addMemoryUsageAlarm[disambiguator];
318343
const createdAlarm = this.usageAlarmFactory.addMaxMemoryUsagePercentAlarm(
@@ -323,6 +348,17 @@ export class FargateServiceMonitoring extends Monitoring {
323348
this.memoryUsageAnnotations.push(createdAlarm.annotation);
324349
this.addAlarm(createdAlarm);
325350
}
351+
for (const disambiguator in props.addMemoryP100UsageAlarm) {
352+
const alarmProps = props.addMemoryP100UsageAlarm[disambiguator];
353+
const createdAlarm = this.usageAlarmFactory.addMemoryUsagePercentAlarm(
354+
this.memoryP100UtilisationMetric,
355+
alarmProps,
356+
UsageType.P100,
357+
disambiguator,
358+
);
359+
this.memoryUsageAnnotations.push(createdAlarm.annotation);
360+
this.addAlarm(createdAlarm);
361+
}
326362

327363
for (const disambiguator in props.addRunningTaskCountAlarm) {
328364
const alarmProps = props.addRunningTaskCountAlarm[disambiguator];
@@ -408,7 +444,7 @@ export class FargateServiceMonitoring extends Monitoring {
408444
width,
409445
height,
410446
title: "CPU Utilization",
411-
left: [this.cpuUtilisationMetric],
447+
left: [this.cpuUtilisationMetric, this.cpuP100UtilisationMetric],
412448
leftYAxis: PercentageAxisFromZeroToHundred,
413449
leftAnnotations: this.cpuUsageAnnotations,
414450
});
@@ -419,7 +455,7 @@ export class FargateServiceMonitoring extends Monitoring {
419455
width,
420456
height,
421457
title: "Memory Utilization",
422-
left: [this.memoryUtilisationMetric],
458+
left: [this.memoryUtilisationMetric, this.memoryP100UtilisationMetric],
423459
leftYAxis: PercentageAxisFromZeroToHundred,
424460
leftAnnotations: this.memoryUsageAnnotations,
425461
});

test/monitoring/aws-ecs-patterns/Ec2ServiceMonitoring.test.ts

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,16 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
109109
maxUsagePercent: 80,
110110
},
111111
},
112+
addCpuP100UsageAlarm: {
113+
Warning: {
114+
maxUsagePercent: 90,
115+
},
116+
},
117+
addMemoryP100UsageAlarm: {
118+
Warning: {
119+
maxUsagePercent: 90,
120+
},
121+
},
112122
addRunningTaskCountAlarm: {
113123
Warning: {
114124
maxRunningTasks: 5,
@@ -134,7 +144,7 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
134144
});
135145

136146
addMonitoringDashboardsToStack(stack, monitoring);
137-
expect(numAlarmsCreated).toStrictEqual(8);
147+
expect(numAlarmsCreated).toStrictEqual(10);
138148
expect(Template.fromStack(stack)).toMatchSnapshot();
139149
});
140150

@@ -241,6 +251,16 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
241251
maxUsagePercent: 80,
242252
},
243253
},
254+
addCpuP100UsageAlarm: {
255+
Warning: {
256+
maxUsagePercent: 90,
257+
},
258+
},
259+
addMemoryP100UsageAlarm: {
260+
Warning: {
261+
maxUsagePercent: 90,
262+
},
263+
},
244264
addRunningTaskCountAlarm: {
245265
Warning: {
246266
maxRunningTasks: 5,
@@ -266,7 +286,7 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
266286
});
267287

268288
addMonitoringDashboardsToStack(stack, monitoring);
269-
expect(numAlarmsCreated).toStrictEqual(8);
289+
expect(numAlarmsCreated).toStrictEqual(10);
270290
expect(Template.fromStack(stack)).toMatchSnapshot();
271291
});
272292
},

0 commit comments

Comments
 (0)