Skip to content

Commit 2675a1f

Browse files
committed
feat(aws-glue-alpha): add optional metrics control for cost optimization
Add `enableMetrics` and `enableObservabilityMetrics` properties to the `SparkJobProps` and `RayJobProps` interfaces, allowing users to disable CloudWatch metrics collection for cost control while maintaining backward compatibility.

- Add conditional logic to exclude metrics arguments when disabled
- Keep both defaults at `true` for backward compatibility
- Apply the same pattern to all 7 job types (6 Spark + 1 Ray)
- Add comprehensive test coverage (8 new test cases)
- Update README with cost optimization examples
1 parent 918593d commit 2675a1f

15 files changed

+1427
-4
lines changed

packages/@aws-cdk/aws-glue-alpha/README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,36 @@ new glue.RayJob(stack, 'ImportedJob', {
343343
});
344344
```
345345

346+
### Metrics Control
347+
348+
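By default, Glue jobs enable job profiling metrics (`--enable-metrics`) and observability metrics (`--enable-observability-metrics`) for monitoring and debugging. Both sets of metrics are published to CloudWatch. To reduce CloudWatch costs, set `enableMetrics` and/or `enableObservabilityMetrics` to `false`; the corresponding argument is then omitted from the job's default arguments: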
349+
350+
```ts
351+
import * as cdk from 'aws-cdk-lib';
352+
import * as iam from 'aws-cdk-lib/aws-iam';
353+
declare const stack: cdk.Stack;
354+
declare const role: iam.IRole;
355+
declare const script: glue.Code;
356+
357+
// Disable both metrics for cost optimization
358+
new glue.PySparkEtlJob(stack, 'CostOptimizedJob', {
359+
role,
360+
script,
361+
enableMetrics: false,
362+
enableObservabilityMetrics: false,
363+
});
364+
365+
// Selective control - keep observability metrics, disable profiling metrics
366+
new glue.PySparkEtlJob(stack, 'SelectiveJob', {
367+
role,
368+
script,
369+
enableMetrics: false,
370+
enableObservabilityMetrics: true,
371+
});
372+
```
373+
374+
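Both properties default to `true`, so existing jobs are unaffected unless you explicitly set them to `false`. This feature is available for all Spark job types (ETL, Streaming, Flex) and Ray jobs.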
375+
346376
### Enable Job Run Queuing
347377

348378
AWS Glue job queuing monitors your account level quotas and limits. If quotas or limits are insufficient to start a Glue job run, AWS Glue will automatically queue the job and wait for limits to free up. Once limits become available, AWS Glue will retry the job run. Glue jobs will queue for limits like max concurrent job runs per account, max concurrent Data Processing Units (DPU), and resource unavailable due to IP address exhaustion in Amazon Virtual Private Cloud (Amazon VPC).

packages/@aws-cdk/aws-glue-alpha/lib/jobs/ray-job.ts

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,24 @@ export interface RayJobProps extends JobProps {
2929
* @default - no job run queuing
3030
*/
3131
readonly jobRunQueuingEnabled?: boolean;
32+
33+
/**
34+
* Enable profiling metrics for the Glue job.
35+
*
36+
* When enabled, adds '--enable-metrics': '' to job arguments; when disabled, the argument is omitted.
37+
*
38+
* @default true - metrics are enabled by default for backward compatibility
39+
*/
40+
readonly enableMetrics?: boolean;
41+
42+
/**
43+
* Enable observability metrics for the Glue job.
44+
*
45+
* When enabled, adds '--enable-observability-metrics': 'true' to job arguments.
46+
*
47+
* @default true - observability metrics are enabled by default for backward compatibility
48+
*/
49+
readonly enableObservabilityMetrics?: boolean;
3250
}
3351

3452
/**
@@ -66,8 +84,10 @@ export class RayJob extends Job {
6684

6785
// Enable continuous logging and, unless disabled via props, CloudWatch metrics by default as a best practice
6886
const continuousLoggingArgs = this.setupContinuousLogging(this.role, props.continuousLogging);
69-
const profilingMetricsArgs = { '--enable-metrics': '' };
70-
const observabilityMetricsArgs = { '--enable-observability-metrics': 'true' };
87+
88+
// Conditionally include metrics arguments (default to enabled for backward compatibility)
89+
const profilingMetricsArgs = (props.enableMetrics ?? true) ? { '--enable-metrics': '' } : {};
90+
const observabilityMetricsArgs = (props.enableObservabilityMetrics ?? true) ? { '--enable-observability-metrics': 'true' } : {};
7191

7292
// Combine command line arguments into a single line item
7393
const defaultArguments = {

packages/@aws-cdk/aws-glue-alpha/lib/jobs/spark-job.ts

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,24 @@ export interface SparkJobProps extends JobProps {
101101
* @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
102102
*/
103103
readonly sparkUI?: SparkUIProps;
104+
105+
/**
106+
* Enable profiling metrics for the Glue job.
107+
*
108+
* When enabled, adds '--enable-metrics': '' to job arguments; when disabled, the argument is omitted.
109+
*
110+
* @default true - metrics are enabled by default for backward compatibility
111+
*/
112+
readonly enableMetrics?: boolean;
113+
114+
/**
115+
* Enable observability metrics for the Glue job.
116+
*
117+
* When enabled, adds '--enable-observability-metrics': 'true' to job arguments.
118+
*
119+
* @default true - observability metrics are enabled by default for backward compatibility
120+
*/
121+
readonly enableObservabilityMetrics?: boolean;
104122
}
105123

106124
/**
@@ -134,8 +152,10 @@ export abstract class SparkJob extends Job {
134152
protected nonExecutableCommonArguments(props: SparkJobProps): {[key: string]: string} {
135153
// Enable continuous logging and, unless disabled via props, CloudWatch metrics by default as a best practice
136154
const continuousLoggingArgs = this.setupContinuousLogging(this.role, props.continuousLogging);
137-
const profilingMetricsArgs = { '--enable-metrics': '' };
138-
const observabilityMetricsArgs = { '--enable-observability-metrics': 'true' };
155+
156+
// Conditionally include metrics arguments (default to enabled for backward compatibility)
157+
const profilingMetricsArgs = (props.enableMetrics ?? true) ? { '--enable-metrics': '' } : {};
158+
const observabilityMetricsArgs = (props.enableObservabilityMetrics ?? true) ? { '--enable-observability-metrics': 'true' } : {};
139159

140160
// Set spark ui args, if spark ui logging had been setup
141161
const sparkUIArgs = this.sparkUILoggingLocation ? ({

packages/@aws-cdk/aws-glue-alpha/test/integ.job-metrics-disabled.js.snapshot/asset.432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855.py

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/@aws-cdk/aws-glue-alpha/test/integ.job-metrics-disabled.js.snapshot/aws-glue-job-metrics-disabled.assets.json

Lines changed: 34 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
{
2+
"Resources": {
3+
"IAMServiceRole61C662C4": {
4+
"Type": "AWS::IAM::Role",
5+
"Properties": {
6+
"AssumeRolePolicyDocument": {
7+
"Statement": [
8+
{
9+
"Action": "sts:AssumeRole",
10+
"Effect": "Allow",
11+
"Principal": {
12+
"Service": "glue.amazonaws.com"
13+
}
14+
}
15+
],
16+
"Version": "2012-10-17"
17+
},
18+
"ManagedPolicyArns": [
19+
{
20+
"Fn::Join": [
21+
"",
22+
[
23+
"arn:",
24+
{
25+
"Ref": "AWS::Partition"
26+
},
27+
":iam::aws:policy/service-role/AWSGlueServiceRole"
28+
]
29+
]
30+
}
31+
]
32+
}
33+
},
34+
"IAMServiceRoleDefaultPolicy379D1A0E": {
35+
"Type": "AWS::IAM::Policy",
36+
"Properties": {
37+
"PolicyDocument": {
38+
"Statement": [
39+
{
40+
"Action": [
41+
"s3:GetBucket*",
42+
"s3:GetObject*",
43+
"s3:List*"
44+
],
45+
"Effect": "Allow",
46+
"Resource": [
47+
{
48+
"Fn::Join": [
49+
"",
50+
[
51+
"arn:",
52+
{
53+
"Ref": "AWS::Partition"
54+
},
55+
":s3:::",
56+
{
57+
"Fn::Sub": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}"
58+
},
59+
"/*"
60+
]
61+
]
62+
},
63+
{
64+
"Fn::Join": [
65+
"",
66+
[
67+
"arn:",
68+
{
69+
"Ref": "AWS::Partition"
70+
},
71+
":s3:::",
72+
{
73+
"Fn::Sub": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}"
74+
}
75+
]
76+
]
77+
}
78+
]
79+
}
80+
],
81+
"Version": "2012-10-17"
82+
},
83+
"PolicyName": "IAMServiceRoleDefaultPolicy379D1A0E",
84+
"Roles": [
85+
{
86+
"Ref": "IAMServiceRole61C662C4"
87+
}
88+
]
89+
}
90+
},
91+
"PySparkETLJobNoMetrics83C22B7E": {
92+
"Type": "AWS::Glue::Job",
93+
"Properties": {
94+
"Command": {
95+
"Name": "glueetl",
96+
"PythonVersion": "3",
97+
"ScriptLocation": {
98+
"Fn::Join": [
99+
"",
100+
[
101+
"s3://",
102+
{
103+
"Fn::Sub": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}"
104+
},
105+
"/432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855.py"
106+
]
107+
]
108+
}
109+
},
110+
"DefaultArguments": {
111+
"--job-language": "python",
112+
"--enable-continuous-cloudwatch-log": "true"
113+
},
114+
"GlueVersion": "4.0",
115+
"JobRunQueuingEnabled": false,
116+
"Name": "PySparkETLJobNoMetrics",
117+
"NumberOfWorkers": 10,
118+
"Role": {
119+
"Fn::GetAtt": [
120+
"IAMServiceRole61C662C4",
121+
"Arn"
122+
]
123+
},
124+
"WorkerType": "G.1X"
125+
}
126+
},
127+
"RayJobNoMetricsBE507C8E": {
128+
"Type": "AWS::Glue::Job",
129+
"Properties": {
130+
"Command": {
131+
"Name": "glueray",
132+
"Runtime": "Ray2.4",
133+
"ScriptLocation": {
134+
"Fn::Join": [
135+
"",
136+
[
137+
"s3://",
138+
{
139+
"Fn::Sub": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}"
140+
},
141+
"/432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855.py"
142+
]
143+
]
144+
}
145+
},
146+
"DefaultArguments": {
147+
"--enable-continuous-cloudwatch-log": "true"
148+
},
149+
"GlueVersion": "4.0",
150+
"JobRunQueuingEnabled": false,
151+
"Name": "RayJobNoMetrics",
152+
"NumberOfWorkers": 3,
153+
"Role": {
154+
"Fn::GetAtt": [
155+
"IAMServiceRole61C662C4",
156+
"Arn"
157+
]
158+
},
159+
"WorkerType": "Z.2X"
160+
}
161+
},
162+
"PySparkETLJobSelectiveMetrics9B08F2F0": {
163+
"Type": "AWS::Glue::Job",
164+
"Properties": {
165+
"Command": {
166+
"Name": "glueetl",
167+
"PythonVersion": "3",
168+
"ScriptLocation": {
169+
"Fn::Join": [
170+
"",
171+
[
172+
"s3://",
173+
{
174+
"Fn::Sub": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}"
175+
},
176+
"/432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855.py"
177+
]
178+
]
179+
}
180+
},
181+
"DefaultArguments": {
182+
"--job-language": "python",
183+
"--enable-continuous-cloudwatch-log": "true",
184+
"--enable-observability-metrics": "true"
185+
},
186+
"GlueVersion": "4.0",
187+
"JobRunQueuingEnabled": false,
188+
"Name": "PySparkETLJobSelectiveMetrics",
189+
"NumberOfWorkers": 10,
190+
"Role": {
191+
"Fn::GetAtt": [
192+
"IAMServiceRole61C662C4",
193+
"Arn"
194+
]
195+
},
196+
"WorkerType": "G.1X"
197+
}
198+
}
199+
},
200+
"Parameters": {
201+
"BootstrapVersion": {
202+
"Type": "AWS::SSM::Parameter::Value<String>",
203+
"Default": "/cdk-bootstrap/hnb659fds/version",
204+
"Description": "Version of the CDK Bootstrap resources in this environment, automatically retrieved from SSM Parameter Store. [cdk:skip]"
205+
}
206+
},
207+
"Rules": {
208+
"CheckBootstrapVersion": {
209+
"Assertions": [
210+
{
211+
"Assert": {
212+
"Fn::Not": [
213+
{
214+
"Fn::Contains": [
215+
[
216+
"1",
217+
"2",
218+
"3",
219+
"4",
220+
"5"
221+
],
222+
{
223+
"Ref": "BootstrapVersion"
224+
}
225+
]
226+
}
227+
]
228+
},
229+
"AssertDescription": "CDK bootstrap stack version 6 required. Please run 'cdk bootstrap' with a recent version of the CDK CLI."
230+
}
231+
]
232+
}
233+
}
234+
}

0 commit comments

Comments
 (0)