Skip to content

Commit 6d5badf

Browse files
voho and mergify[bot] authored
feat(glue): add alarms for failed/killed tasks (#100)
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
1 parent fecff60 commit 6d5badf

File tree

7 files changed

+742
-30
lines changed

7 files changed

+742
-30
lines changed

API.md

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15507,6 +15507,10 @@ const glueJobMonitoringOptions: GlueJobMonitoringOptions = { ... }
1550715507
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addToDetailDashboard">addToDetailDashboard</a></code> | <code>boolean</code> | Flag indicating if the widgets should be added to detailed dashboard. |
1550815508
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addToSummaryDashboard">addToSummaryDashboard</a></code> | <code>boolean</code> | Flag indicating if the widgets should be added to summary dashboard. |
1550915509
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringOptions.property.useCreatedAlarms">useCreatedAlarms</a></code> | <code><a href="#cdk-monitoring-constructs.IAlarmConsumer">IAlarmConsumer</a></code> | Calls provided function to process all alarms created. |
15510+
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addFailedTaskCountAlarm">addFailedTaskCountAlarm</a></code> | <code>{[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorCountThreshold">ErrorCountThreshold</a>}</code> | *No description.* |
15511+
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addFailedTaskRateAlarm">addFailedTaskRateAlarm</a></code> | <code>{[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorRateThreshold">ErrorRateThreshold</a>}</code> | *No description.* |
15512+
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addKilledTaskCountAlarm">addKilledTaskCountAlarm</a></code> | <code>{[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorCountThreshold">ErrorCountThreshold</a>}</code> | *No description.* |
15513+
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addKilledTaskRateAlarm">addKilledTaskRateAlarm</a></code> | <code>{[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorRateThreshold">ErrorRateThreshold</a>}</code> | *No description.* |
1551015514

1551115515
---
1551215516

@@ -15608,6 +15612,46 @@ Calls provided function to process all alarms created.
1560815612

1560915613
---
1561015614

15615+
##### `addFailedTaskCountAlarm`<sup>Optional</sup> <a name="addFailedTaskCountAlarm" id="cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addFailedTaskCountAlarm"></a>
15616+
15617+
```typescript
15618+
public readonly addFailedTaskCountAlarm: {[ key: string ]: ErrorCountThreshold};
15619+
```
15620+
15621+
- *Type:* {[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorCountThreshold">ErrorCountThreshold</a>}
15622+
15623+
---
15624+
15625+
##### `addFailedTaskRateAlarm`<sup>Optional</sup> <a name="addFailedTaskRateAlarm" id="cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addFailedTaskRateAlarm"></a>
15626+
15627+
```typescript
15628+
public readonly addFailedTaskRateAlarm: {[ key: string ]: ErrorRateThreshold};
15629+
```
15630+
15631+
- *Type:* {[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorRateThreshold">ErrorRateThreshold</a>}
15632+
15633+
---
15634+
15635+
##### `addKilledTaskCountAlarm`<sup>Optional</sup> <a name="addKilledTaskCountAlarm" id="cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addKilledTaskCountAlarm"></a>
15636+
15637+
```typescript
15638+
public readonly addKilledTaskCountAlarm: {[ key: string ]: ErrorCountThreshold};
15639+
```
15640+
15641+
- *Type:* {[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorCountThreshold">ErrorCountThreshold</a>}
15642+
15643+
---
15644+
15645+
##### `addKilledTaskRateAlarm`<sup>Optional</sup> <a name="addKilledTaskRateAlarm" id="cdk-monitoring-constructs.GlueJobMonitoringOptions.property.addKilledTaskRateAlarm"></a>
15646+
15647+
```typescript
15648+
public readonly addKilledTaskRateAlarm: {[ key: string ]: ErrorRateThreshold};
15649+
```
15650+
15651+
- *Type:* {[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorRateThreshold">ErrorRateThreshold</a>}
15652+
15653+
---
15654+
1561115655
### GlueJobMonitoringProps <a name="GlueJobMonitoringProps" id="cdk-monitoring-constructs.GlueJobMonitoringProps"></a>
1561215656

1561315657
#### Initializer <a name="Initializer" id="cdk-monitoring-constructs.GlueJobMonitoringProps.Initializer"></a>
@@ -15629,6 +15673,10 @@ const glueJobMonitoringProps: GlueJobMonitoringProps = { ... }
1562915673
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringProps.property.addToDetailDashboard">addToDetailDashboard</a></code> | <code>boolean</code> | Flag indicating if the widgets should be added to detailed dashboard. |
1563015674
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringProps.property.addToSummaryDashboard">addToSummaryDashboard</a></code> | <code>boolean</code> | Flag indicating if the widgets should be added to summary dashboard. |
1563115675
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringProps.property.useCreatedAlarms">useCreatedAlarms</a></code> | <code><a href="#cdk-monitoring-constructs.IAlarmConsumer">IAlarmConsumer</a></code> | Calls provided function to process all alarms created. |
15676+
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringProps.property.addFailedTaskCountAlarm">addFailedTaskCountAlarm</a></code> | <code>{[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorCountThreshold">ErrorCountThreshold</a>}</code> | *No description.* |
15677+
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringProps.property.addFailedTaskRateAlarm">addFailedTaskRateAlarm</a></code> | <code>{[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorRateThreshold">ErrorRateThreshold</a>}</code> | *No description.* |
15678+
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringProps.property.addKilledTaskCountAlarm">addKilledTaskCountAlarm</a></code> | <code>{[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorCountThreshold">ErrorCountThreshold</a>}</code> | *No description.* |
15679+
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringProps.property.addKilledTaskRateAlarm">addKilledTaskRateAlarm</a></code> | <code>{[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorRateThreshold">ErrorRateThreshold</a>}</code> | *No description.* |
1563215680
| <code><a href="#cdk-monitoring-constructs.GlueJobMonitoringProps.property.jobName">jobName</a></code> | <code>string</code> | *No description.* |
1563315681

1563415682
---
@@ -15731,6 +15779,46 @@ Calls provided function to process all alarms created.
1573115779

1573215780
---
1573315781

15782+
##### `addFailedTaskCountAlarm`<sup>Optional</sup> <a name="addFailedTaskCountAlarm" id="cdk-monitoring-constructs.GlueJobMonitoringProps.property.addFailedTaskCountAlarm"></a>
15783+
15784+
```typescript
15785+
public readonly addFailedTaskCountAlarm: {[ key: string ]: ErrorCountThreshold};
15786+
```
15787+
15788+
- *Type:* {[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorCountThreshold">ErrorCountThreshold</a>}
15789+
15790+
---
15791+
15792+
##### `addFailedTaskRateAlarm`<sup>Optional</sup> <a name="addFailedTaskRateAlarm" id="cdk-monitoring-constructs.GlueJobMonitoringProps.property.addFailedTaskRateAlarm"></a>
15793+
15794+
```typescript
15795+
public readonly addFailedTaskRateAlarm: {[ key: string ]: ErrorRateThreshold};
15796+
```
15797+
15798+
- *Type:* {[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorRateThreshold">ErrorRateThreshold</a>}
15799+
15800+
---
15801+
15802+
##### `addKilledTaskCountAlarm`<sup>Optional</sup> <a name="addKilledTaskCountAlarm" id="cdk-monitoring-constructs.GlueJobMonitoringProps.property.addKilledTaskCountAlarm"></a>
15803+
15804+
```typescript
15805+
public readonly addKilledTaskCountAlarm: {[ key: string ]: ErrorCountThreshold};
15806+
```
15807+
15808+
- *Type:* {[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorCountThreshold">ErrorCountThreshold</a>}
15809+
15810+
---
15811+
15812+
##### `addKilledTaskRateAlarm`<sup>Optional</sup> <a name="addKilledTaskRateAlarm" id="cdk-monitoring-constructs.GlueJobMonitoringProps.property.addKilledTaskRateAlarm"></a>
15813+
15814+
```typescript
15815+
public readonly addKilledTaskRateAlarm: {[ key: string ]: ErrorRateThreshold};
15816+
```
15817+
15818+
- *Type:* {[ key: string ]: <a href="#cdk-monitoring-constructs.ErrorRateThreshold">ErrorRateThreshold</a>}
15819+
15820+
---
15821+
1573415822
##### `jobName`<sup>Required</sup> <a name="jobName" id="cdk-monitoring-constructs.GlueJobMonitoringProps.property.jobName"></a>
1573515823

1573615824
```typescript
@@ -39483,6 +39571,11 @@ new GlueJobMetricFactory(metricFactory: MetricFactory, jobName: string)
3948339571
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricAverageExecutorCpuUsagePercentage">metricAverageExecutorCpuUsagePercentage</a></code> | *No description.* |
3948439572
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricAverageExecutorMemoryUsagePercentage">metricAverageExecutorMemoryUsagePercentage</a></code> | *No description.* |
3948539573
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricCompletedStagesSum">metricCompletedStagesSum</a></code> | *No description.* |
39574+
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricCompletedTasksSum">metricCompletedTasksSum</a></code> | *No description.* |
39575+
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricFailedTasksRate">metricFailedTasksRate</a></code> | *No description.* |
39576+
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricFailedTasksSum">metricFailedTasksSum</a></code> | *No description.* |
39577+
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricKilledTasksRate">metricKilledTasksRate</a></code> | *No description.* |
39578+
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricKilledTasksSum">metricKilledTasksSum</a></code> | *No description.* |
3948639579
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricMaximumNeededExecutors">metricMaximumNeededExecutors</a></code> | *No description.* |
3948739580
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricTotalReadBytesFromS3">metricTotalReadBytesFromS3</a></code> | *No description.* |
3948839581
| <code><a href="#cdk-monitoring-constructs.GlueJobMetricFactory.metricTotalWrittenBytesToS3">metricTotalWrittenBytesToS3</a></code> | *No description.* |
@@ -39513,6 +39606,36 @@ public metricAverageExecutorMemoryUsagePercentage(): Metric | MathExpression
3951339606
public metricCompletedStagesSum(): Metric | MathExpression
3951439607
```
3951539608

39609+
##### `metricCompletedTasksSum` <a name="metricCompletedTasksSum" id="cdk-monitoring-constructs.GlueJobMetricFactory.metricCompletedTasksSum"></a>
39610+
39611+
```typescript
39612+
public metricCompletedTasksSum(): Metric | MathExpression
39613+
```
39614+
39615+
##### `metricFailedTasksRate` <a name="metricFailedTasksRate" id="cdk-monitoring-constructs.GlueJobMetricFactory.metricFailedTasksRate"></a>
39616+
39617+
```typescript
39618+
public metricFailedTasksRate(): Metric | MathExpression
39619+
```
39620+
39621+
##### `metricFailedTasksSum` <a name="metricFailedTasksSum" id="cdk-monitoring-constructs.GlueJobMetricFactory.metricFailedTasksSum"></a>
39622+
39623+
```typescript
39624+
public metricFailedTasksSum(): Metric | MathExpression
39625+
```
39626+
39627+
##### `metricKilledTasksRate` <a name="metricKilledTasksRate" id="cdk-monitoring-constructs.GlueJobMetricFactory.metricKilledTasksRate"></a>
39628+
39629+
```typescript
39630+
public metricKilledTasksRate(): Metric | MathExpression
39631+
```
39632+
39633+
##### `metricKilledTasksSum` <a name="metricKilledTasksSum" id="cdk-monitoring-constructs.GlueJobMetricFactory.metricKilledTasksSum"></a>
39634+
39635+
```typescript
39636+
public metricKilledTasksSum(): Metric | MathExpression
39637+
```
39638+
3951639639
##### `metricMaximumNeededExecutors` <a name="metricMaximumNeededExecutors" id="cdk-monitoring-constructs.GlueJobMetricFactory.metricMaximumNeededExecutors"></a>
3951739640

3951839641
```typescript
@@ -48030,6 +48153,7 @@ create a two sets of dashboards: standard set (interactive) and a copy (bitmap).
4803048153
| <code><a href="#cdk-monitoring-constructs.ErrorType.READ_ERROR">READ_ERROR</a></code> | *No description.* |
4803148154
| <code><a href="#cdk-monitoring-constructs.ErrorType.WRITE_ERROR">WRITE_ERROR</a></code> | *No description.* |
4803248155
| <code><a href="#cdk-monitoring-constructs.ErrorType.EXPIRED">EXPIRED</a></code> | *No description.* |
48156+
| <code><a href="#cdk-monitoring-constructs.ErrorType.KILLED">KILLED</a></code> | *No description.* |
4803348157

4803448158
---
4803548159

@@ -48088,6 +48212,11 @@ create a two sets of dashboards: standard set (interactive) and a copy (bitmap).
4808848212
---
4808948213

4809048214

48215+
##### `KILLED` <a name="KILLED" id="cdk-monitoring-constructs.ErrorType.KILLED"></a>
48216+
48217+
---
48218+
48219+
4809148220
### GraphWidgetType <a name="GraphWidgetType" id="cdk-monitoring-constructs.GraphWidgetType"></a>
4809248221

4809348222
#### Members <a name="Members" id="Members"></a>

lib/common/monitoring/alarms/ErrorAlarmFactory.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ export enum ErrorType {
1818
READ_ERROR = "ReadError",
1919
WRITE_ERROR = "WriteError",
2020
EXPIRED = "Expired",
21+
KILLED = "Killed",
2122
}
2223

2324
export interface ErrorCountThreshold extends CustomAlarmThreshold {

lib/monitoring/aws-glue/GlueJobMetricFactory.ts

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import { DimensionsMap } from "aws-cdk-lib/aws-cloudwatch";
22

3-
import { MetricFactory, MetricStatistic } from "../../common";
3+
import {
4+
MetricFactory,
5+
MetricStatistic,
6+
RateComputationMethod,
7+
} from "../../common";
48

59
const GlueNamespace = "Glue";
610

@@ -83,6 +87,59 @@ export class GlueJobMetricFactory {
8387
);
8488
}
8589

90+
/**
 * Builds the "Completed Tasks" metric for this Glue job.
 *
 * Uses the `glue.driver.aggregate.numCompletedTasks` metric with the SUM
 * statistic, scoped by this factory's dimensions map and the Glue namespace.
 *
 * @returns the metric produced by the underlying metric factory
 */
metricCompletedTasksSum() {
  const metricName = "glue.driver.aggregate.numCompletedTasks";
  const label = "Completed Tasks";
  return this.metricFactory.createMetric(
    metricName,
    MetricStatistic.SUM,
    label,
    this.dimensionsMap,
    // Fifth argument is intentionally left unset (presumably an optional
    // setting that falls back to its default; confirm against createMetric).
    undefined,
    GlueNamespace
  );
}
100+
101+
/**
 * Builds the "Failed Tasks" metric for this Glue job.
 *
 * Uses the `glue.driver.aggregate.numFailedTasks` metric with the SUM
 * statistic, scoped by this factory's dimensions map and the Glue namespace.
 *
 * @returns the metric produced by the underlying metric factory
 */
metricFailedTasksSum() {
  const metricName = "glue.driver.aggregate.numFailedTasks";
  const label = "Failed Tasks";
  return this.metricFactory.createMetric(
    metricName,
    MetricStatistic.SUM,
    label,
    this.dimensionsMap,
    // Fifth argument is intentionally left unset (presumably an optional
    // setting that falls back to its default; confirm against createMetric).
    undefined,
    GlueNamespace
  );
}
111+
112+
/**
 * Converts the failed-tasks sum metric into a rate using the AVERAGE
 * rate computation method.
 *
 * The label string passed to `toRate` is "failed" so that it matches the
 * metric being converted; it was previously "killed", apparently copied
 * from `metricKilledTasksRate`.
 *
 * @returns the rate metric produced by the underlying metric factory
 */
metricFailedTasksRate() {
  return this.metricFactory.toRate(
    this.metricFailedTasksSum(),
    RateComputationMethod.AVERAGE,
    true,
    // NOTE(review): presumably the unit text shown in the label; confirm
    // against MetricFactory.toRate.
    "failed",
    false
  );
}
121+
122+
/**
 * Builds the "Killed Tasks" metric for this Glue job.
 *
 * Uses the `glue.driver.aggregate.numKilledTasks` metric with the SUM
 * statistic, scoped by this factory's dimensions map and the Glue namespace.
 *
 * @returns the metric produced by the underlying metric factory
 */
metricKilledTasksSum() {
  const metricName = "glue.driver.aggregate.numKilledTasks";
  const label = "Killed Tasks";
  return this.metricFactory.createMetric(
    metricName,
    MetricStatistic.SUM,
    label,
    this.dimensionsMap,
    // Fifth argument is intentionally left unset (presumably an optional
    // setting that falls back to its default; confirm against createMetric).
    undefined,
    GlueNamespace
  );
}
132+
133+
/**
 * Converts the killed-tasks sum metric into a rate using the AVERAGE
 * rate computation method, passing "killed" as the label string.
 *
 * @returns the rate metric produced by the underlying metric factory
 */
metricKilledTasksRate() {
  const killedTasks = this.metricKilledTasksSum();
  const method = RateComputationMethod.AVERAGE;
  return this.metricFactory.toRate(killedTasks, method, true, "killed", false);
}
142+
86143
metricMaximumNeededExecutors() {
87144
return this.metricFactory.createMetric(
88145
"glue.driver.ExecutorAllocationManager.executors.numberMaxNeededExecutors",

0 commit comments

Comments
 (0)