Skip to content

Commit 049f4a0

Browse files
authored
Merge branch 'develop' into develop
2 parents 9f7f793 + 0b72979 commit 049f4a0

File tree

39 files changed

+1090
-169
lines changed

39 files changed

+1090
-169
lines changed

.github/workflows/dokken-system-tests.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ jobs:
3333
- alinux-2023
3434
- ubuntu2004
3535
- ubuntu2204
36+
- ubuntu2404
3637
- rhel8
3738
- rocky8
3839
- rhel9

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste
55

66
3.13.0
77
------
8+
**ENHANCEMENTS**
9+
- Add support for Ubuntu24.
810

911
**CHANGES**
1012
- Upgrade Python to 3.12.8 for all OSs except AL2 (from 3.9.20).
@@ -22,6 +24,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
2224
- Upgrade ARM PL to version 24.10 (from 23.10).
2325
- Remove generation of DSA keys for login nodes as DSA, which became unsupported in OpenSSH 9.7+.
2426
- Set instance ID and instance type information in Slurm upon compute nodes launch.
27+
- Install NVIDIA drivers without the option 'no-cc-version-check', which is now deprecated in the NVIDIA installer.
2528

2629
3.12.0
2730
------
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License").
4+
# You may not use this file except in compliance with the
5+
# License. A copy of the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
10+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
11+
# limitations under the License.
12+
13+
14+
import argparse
15+
import base64
16+
import configparser
17+
import json
18+
import logging
19+
import os
20+
from email import message_from_string
21+
22+
import boto3
23+
import yaml
24+
from botocore.config import Config
25+
from retrying import retry
26+
27+
COMPUTE_FLEET_SHARED_LOCATION = "/opt/parallelcluster/shared/"
28+
29+
COMPUTE_FLEET_SHARED_DNA_LOCATION = COMPUTE_FLEET_SHARED_LOCATION + "dna/"
30+
31+
COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG = COMPUTE_FLEET_SHARED_LOCATION + "launch-templates-config.json"
32+
33+
logger = logging.getLogger(__name__)
34+
logging.basicConfig(level=logging.INFO)
35+
36+
37+
def get_compute_launch_template_ids(lt_config_file_name):
38+
"""
39+
Load launch-templates-config.json.
40+
41+
It contains ID, Version number and Logical ID of all queues in Compute Fleet's Launch Template.
42+
43+
The format of launch-templates-config.json:
44+
{
45+
"Queues": {
46+
"queue1": {
47+
"ComputeResources": {
48+
"queue1-i1": {
49+
"LaunchTemplate": {
50+
"Version": "1",
51+
"LogicalId": "LaunchTemplate123456789012345",
52+
"Id": "lt-12345678901234567"
53+
}
54+
}
55+
}
56+
},
57+
"queue2": {
58+
"ComputeResources": {
59+
"queue2-i1": {
60+
"LaunchTemplate": {
61+
"Version": "1",
62+
"LogicalId": "LaunchTemplate012345678901234",
63+
"Id": "lt-01234567890123456"
64+
}
65+
}
66+
}
67+
}
68+
}
69+
}
70+
71+
"""
72+
lt_config = None
73+
try:
74+
with open(lt_config_file_name, "r", encoding="utf-8") as file:
75+
lt_config = json.loads(file.read())
76+
except Exception as err:
77+
logger.warning("Unable to read %s due to %s", lt_config_file_name, err)
78+
79+
return lt_config
80+
81+
82+
def share_compute_fleet_dna(args):
83+
"""Create dna.json for each queue in cluster."""
84+
lt_config = get_compute_launch_template_ids(COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG)
85+
if lt_config:
86+
all_queues = lt_config.get("Queues")
87+
for _, queues in all_queues.items():
88+
compute_resources = queues.get("ComputeResources")
89+
for _, compute_res in compute_resources.items():
90+
get_latest_dna_data(compute_res, COMPUTE_FLEET_SHARED_DNA_LOCATION, args)
91+
92+
93+
# FIXME: Fix Code Duplication
94+
def parse_proxy_config():
95+
config = configparser.RawConfigParser()
96+
config.read("/etc/boto.cfg")
97+
proxy_config = Config()
98+
if config.has_option("Boto", "proxy") and config.has_option("Boto", "proxy_port"):
99+
proxy = config.get("Boto", "proxy")
100+
proxy_port = config.get("Boto", "proxy_port")
101+
proxy_config = Config(proxies={"https": f"{proxy}:{proxy_port}"})
102+
return proxy_config
103+
104+
105+
@retry(stop_max_attempt_number=5, wait_fixed=3000)
106+
def get_user_data(lt_id, lt_version, region_name):
107+
"""
108+
Get UserData from specified Launch Template using EC2 DescribeLaunchTemplateVersions API.
109+
110+
:param lt_id: Launch Template ID (eg: lt-12345678901234567)
111+
:param lt_version: Launch Template latest Version Number (eg: 2)
112+
:param region_name: AWS region name (eg: us-east-1)
113+
:return: string of user_data in MIME format
114+
"""
115+
decoded_data = None
116+
try:
117+
proxy_config = parse_proxy_config()
118+
119+
ec2_client = boto3.client("ec2", region_name=region_name, config=proxy_config)
120+
response = ec2_client.describe_launch_template_versions(
121+
LaunchTemplateId=lt_id,
122+
Versions=[
123+
lt_version,
124+
],
125+
).get("LaunchTemplateVersions")
126+
decoded_data = base64.b64decode(response[0]["LaunchTemplateData"]["UserData"], validate=True).decode("utf-8")
127+
except Exception as err:
128+
if hasattr(err, "message"):
129+
err = err.message
130+
logger.error(
131+
"Unable to get UserData for launch template %s with version %s.\nException: %s", lt_id, lt_version, err
132+
)
133+
134+
return decoded_data
135+
136+
137+
def get_write_directives_section(user_data):
138+
"""Get write_files section from cloud-config section of MIME formatted UserData."""
139+
write_directives_section = None
140+
try:
141+
data = message_from_string(user_data)
142+
for cloud_config_section in data.walk():
143+
if cloud_config_section.get_content_type() == "text/cloud-config":
144+
write_directives_section = yaml.safe_load(cloud_config_section._payload).get("write_files")
145+
except Exception as err:
146+
logger.error("Error occurred while parsing write_files section.\nException: %s", err)
147+
return write_directives_section
148+
149+
150+
def write_dna_files(write_files_section, shared_storage_loc):
151+
"""
152+
After extracting dna.json from write_files section of UserData, write it in shared location.
153+
154+
:param write_files_section: Entire write_files section from UserData
155+
:param shared_storage_loc: Shared Storage Location of where to write dna.json
156+
:return: None
157+
"""
158+
try:
159+
file_path = shared_storage_loc + "-dna.json"
160+
for data in write_files_section:
161+
if data["path"] in ["/tmp/dna.json"]: # nosec B108
162+
with open(file_path, "w", encoding="utf-8") as file:
163+
file.write(json.dumps(json.loads(data["content"]), indent=4))
164+
except Exception as err:
165+
if hasattr(err, "message"):
166+
err = err.message
167+
logger.error("Unable to write %s due to %s", file_path, err)
168+
169+
170+
def get_latest_dna_data(resource, output_location, args):
171+
"""
172+
Get latest User Data, extract relevant details and write dna.json.
173+
174+
:param resource: Resource containing LT ID, Version and Logical id
175+
:param output_location: Shared Storage Location were we want to write dna.json
176+
:param args: Command Line arguments
177+
:rtype: None
178+
"""
179+
user_data = get_user_data(
180+
resource.get("LaunchTemplate").get("Id"), resource.get("LaunchTemplate").get("Version"), args.region
181+
)
182+
if user_data:
183+
write_directives = get_write_directives_section(user_data)
184+
write_dna_files(write_directives, output_location + resource.get("LaunchTemplate").get("LogicalId"))
185+
186+
187+
def cleanup(directory_loc):
188+
"""Cleanup dna.json and extra.json files."""
189+
for f in os.listdir(directory_loc):
190+
f_path = os.path.join(directory_loc, f)
191+
try:
192+
if os.path.isfile(f_path):
193+
os.remove(f_path)
194+
except Exception as err:
195+
logger.warning("Unable to delete %s due to %s", f_path, err)
196+
197+
198+
def _parse_cli_args():
199+
"""Parse command line args."""
200+
parser = argparse.ArgumentParser(
201+
description="Get latest User Data from ComputeFleet Launch Templates.", exit_on_error=False
202+
)
203+
204+
parser.add_argument(
205+
"-r",
206+
"--region",
207+
required=False,
208+
type=str,
209+
default=os.getenv("AWS_REGION", None),
210+
help="the cluster AWS region, defaults to AWS_REGION env variable",
211+
)
212+
213+
parser.add_argument(
214+
"-c",
215+
"--cleanup",
216+
action="store_true",
217+
required=False,
218+
help="Cleanup DNA files created",
219+
)
220+
221+
args = parser.parse_args()
222+
223+
return args
224+
225+
226+
def main():
227+
try:
228+
args = _parse_cli_args()
229+
if args.cleanup:
230+
cleanup(COMPUTE_FLEET_SHARED_DNA_LOCATION)
231+
else:
232+
share_compute_fleet_dna(args)
233+
except Exception as err:
234+
if hasattr(err, "message"):
235+
err = err.message
236+
logger.exception(
237+
"Encountered exception when fetching latest dna.json for ComputeFleet, exiting gracefully: %s", err
238+
)
239+
raise SystemExit(0)
240+
241+
242+
if __name__ == "__main__":
243+
main()

cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,20 @@ suites:
314314
directory_service:
315315
enabled: "true"
316316
node_type: HeadNode
317+
- name: cfn_hup_configuration
318+
run_list:
319+
- recipe[aws-parallelcluster-tests::setup]
320+
- recipe[aws-parallelcluster-tests::test_resource]
321+
verifier:
322+
controls:
323+
- /tag:config_cfn_hup/
324+
attributes:
325+
resource: cfn_hup_configuration:configure
326+
dependencies:
327+
- recipe:aws-parallelcluster-platform::directories
328+
cluster:
329+
node_type: HeadNode
330+
stack_arn: 'test'
317331

318332
# Recipes
319333
- name: cfnconfig_mixed

cookbooks/aws-parallelcluster-environment/recipes/config.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@
3838
# spack 'Configure Spack Packages' do
3939
# action :configure
4040
# end
41-
include_recipe 'aws-parallelcluster-environment::config_cfn_hup'
41+
cfn_hup_configuration "Configure cfn-hup"

cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb

Lines changed: 0 additions & 61 deletions
This file was deleted.

cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@
7474

7575
# Add cfn-hup runner
7676
template "#{node['cluster']['scripts_dir']}/cfn-hup-runner.sh" do
77-
source "cfn_bootstrap/cfn-hup-runner.sh.erb"
77+
source "cfn_hup_configuration/cfn-hup-runner.sh.erb"
7878
owner 'root'
7979
group 'root'
8080
mode '0744'

0 commit comments

Comments
 (0)