Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import base64
import configparser
import json
import logging
import os
from email import message_from_string

import boto3
import yaml
from botocore.config import Config
from retrying import retry

# Base directory under which this script reads its input and writes its output.
COMPUTE_FLEET_SHARED_LOCATION = "/opt/parallelcluster/shared/"

# Directory that receives the generated "<LogicalId>-dna.json" files.
COMPUTE_FLEET_SHARED_DNA_LOCATION = f"{COMPUTE_FLEET_SHARED_LOCATION}dna/"

# JSON file listing the launch template of every queue / compute resource.
COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG = f"{COMPUTE_FLEET_SHARED_LOCATION}launch-templates-config.json"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_compute_launch_template_ids(lt_config_file_name):
    """
    Read the launch template configuration file and return its parsed content.

    The file is JSON and lists, for every queue and compute resource, the ID, version number
    and logical ID of the corresponding Compute Fleet Launch Template, for example:

    {
        "Queues": {
            "queue1": {
                "ComputeResources": {
                    "queue1-i1": {
                        "LaunchTemplate": {
                            "Version": "1",
                            "LogicalId": "LaunchTemplate123456789012345",
                            "Id": "lt-12345678901234567"
                        }
                    }
                }
            },
            "queue2": {
                "ComputeResources": {
                    "queue2-i1": {
                        "LaunchTemplate": {
                            "Version": "1",
                            "LogicalId": "LaunchTemplate012345678901234",
                            "Id": "lt-01234567890123456"
                        }
                    }
                }
            }
        }
    }

    :param lt_config_file_name: path of the JSON file to load
    :return: the parsed JSON document, or None when the file cannot be read or parsed
    """
    try:
        with open(lt_config_file_name, "r", encoding="utf-8") as config_file:
            return json.load(config_file)
    except Exception as err:
        # Best effort: a missing or malformed file is reported and signalled with None.
        logger.warning("Unable to read %s due to %s", lt_config_file_name, err)
    return None


def share_compute_fleet_dna(args):
    """
    Create a dna.json file for each compute resource of each queue in the cluster.

    The queues are read from COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG and every compute resource is
    handed to get_latest_dna_data, which writes under COMPUTE_FLEET_SHARED_DNA_LOCATION.
    Nothing is done when the configuration cannot be loaded or is empty.

    :param args: Command Line arguments, forwarded to get_latest_dna_data
    :rtype: None
    """
    lt_config = get_compute_launch_template_ids(COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG)
    if not lt_config:
        return

    # `or {}` tolerates a missing or null "Queues"/"ComputeResources" entry instead of failing
    # with AttributeError on None; only the values are needed, so iterate them directly.
    for queue in (lt_config.get("Queues") or {}).values():
        for compute_res in ((queue or {}).get("ComputeResources") or {}).values():
            get_latest_dna_data(compute_res, COMPUTE_FLEET_SHARED_DNA_LOCATION, args)


# FIXME: Fix Code Duplication
def parse_proxy_config():
    """
    Build the botocore Config to use when creating AWS clients.

    When /etc/boto.cfg defines both `proxy` and `proxy_port` in its [Boto] section, the returned
    Config sends HTTPS traffic through "<proxy>:<proxy_port>"; otherwise a default Config is returned.

    :return: botocore Config object
    """
    boto_cfg = configparser.RawConfigParser()
    boto_cfg.read("/etc/boto.cfg")

    proxy_is_configured = all(boto_cfg.has_option("Boto", option) for option in ("proxy", "proxy_port"))
    if not proxy_is_configured:
        return Config()

    host = boto_cfg.get("Boto", "proxy")
    port = boto_cfg.get("Boto", "proxy_port")
    return Config(proxies={"https": f"{host}:{port}"})


@retry(stop_max_attempt_number=5, wait_fixed=3000)
def _fetch_launch_template_user_data(lt_id, lt_version, region_name):
    """Call EC2 DescribeLaunchTemplateVersions and return the decoded UserData, raising on any failure."""
    ec2_client = boto3.client("ec2", region_name=region_name, config=parse_proxy_config())
    versions = ec2_client.describe_launch_template_versions(
        LaunchTemplateId=lt_id,
        # The API expects version numbers as strings; coerce in case the caller holds an int.
        Versions=[str(lt_version)],
    ).get("LaunchTemplateVersions")
    return base64.b64decode(versions[0]["LaunchTemplateData"]["UserData"], validate=True).decode("utf-8")


def get_user_data(lt_id, lt_version, region_name):
    """
    Get UserData from specified Launch Template using EC2 DescribeLaunchTemplateVersions API.

    The API call is attempted up to 5 times, 3 seconds apart. The retry decorator lives on the
    private helper because it only reacts to raised exceptions: placed on this function, which
    swallows every error, it would never trigger.

    :param lt_id: Launch Template ID (eg: lt-12345678901234567)
    :param lt_version: Launch Template latest Version Number (eg: 2)
    :param region_name: AWS region name (eg: us-east-1)
    :return: string of user_data in MIME format, or None when it could not be retrieved
    """
    try:
        return _fetch_launch_template_user_data(lt_id, lt_version, region_name)
    except Exception as err:
        if hasattr(err, "message"):
            err = err.message
        logger.error(
            "Unable to get UserData for launch template %s with version %s.\nException: %s", lt_id, lt_version, err
        )
    return None


def get_write_directives_section(user_data):
    """
    Get write_files section from cloud-config section of MIME formatted UserData.

    Every MIME part of type text/cloud-config is parsed as YAML; when several such parts exist,
    the value taken from the last one is returned.

    :param user_data: UserData as a MIME formatted string
    :return: value of the write_files key, or None when no cloud-config part provides it or
             when parsing fails (the failure is logged)
    """
    write_directives_section = None
    try:
        for part in message_from_string(user_data).walk():
            if part.get_content_type() != "text/cloud-config":
                continue
            # get_payload() is the public accessor for the part body (avoids the private _payload).
            cloud_config = yaml.safe_load(part.get_payload())
            if not isinstance(cloud_config, dict):
                # An empty or non-mapping cloud-config cannot hold write_files; keep scanning other parts.
                logger.warning("Ignoring cloud-config section that is not a YAML mapping")
                continue
            write_directives_section = cloud_config.get("write_files")
    except Exception as err:
        logger.error("Error occurred while parsing write_files section.\nException: %s", err)
    return write_directives_section


def write_dna_files(write_files_section, shared_storage_loc):
    """
    After extracting dna.json from write_files section of UserData, write it in shared location.

    Only entries whose path is /tmp/dna.json are considered. Their content is parsed as JSON and
    written, indented by 4 spaces, to "<shared_storage_loc>-dna.json". Failures are logged, not raised.

    :param write_files_section: Entire write_files section from UserData (list of entries with
                                "path" and "content" keys); None or empty means nothing to write
    :param shared_storage_loc: Path prefix of the output file; "-dna.json" is appended to it
    :return: None
    """
    # Computed outside the try block so the error handler below can always reference it.
    file_path = shared_storage_loc + "-dna.json"
    if not write_files_section:
        logger.warning("No write_files section available, skipping %s", file_path)
        return

    try:
        for data in write_files_section:
            if data["path"] != "/tmp/dna.json":  # nosec B108
                continue
            # Serialize first: opening with "w" truncates the file, so malformed content must
            # fail before an existing dna.json is touched.
            dna_content = json.dumps(json.loads(data["content"]), indent=4)
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(dna_content)
    except Exception as err:
        if hasattr(err, "message"):
            err = err.message
        logger.error("Unable to write %s due to %s", file_path, err)


def get_latest_dna_data(resource, output_location, args):
    """
    Get latest User Data, extract relevant details and write dna.json.

    The output file is "<output_location><LogicalId>-dna.json". A resource whose launch template
    details are incomplete, whose UserData cannot be fetched, or whose UserData has no write_files
    section is skipped so that the remaining resources can still be processed.

    :param resource: Resource containing LT ID, Version and Logical id
    :param output_location: Shared Storage Location where we want to write dna.json
    :param args: Command Line arguments (only `region` is read)
    :rtype: None
    """
    # Look the launch template up once; `or {}` covers a missing or null entry.
    launch_template = resource.get("LaunchTemplate") or {}
    lt_id = launch_template.get("Id")
    logical_id = launch_template.get("LogicalId")
    if not lt_id or not logical_id:
        logger.warning("Skipping compute resource with incomplete LaunchTemplate details: %s", resource)
        return

    user_data = get_user_data(lt_id, launch_template.get("Version"), args.region)
    if not user_data:
        return

    write_directives = get_write_directives_section(user_data)
    if not write_directives:
        logger.warning("No write_files section found in UserData of launch template %s", lt_id)
        return

    write_dna_files(write_directives, output_location + logical_id)


def cleanup(directory_loc):
    """
    Remove every regular file located directly inside the given directory.

    Sub-directories are left untouched. A file that cannot be removed is reported with a warning
    and does not stop the processing of the remaining ones.

    :param directory_loc: directory to clean
    :return: None
    """
    candidates = (os.path.join(directory_loc, entry) for entry in os.listdir(directory_loc))
    for f_path in candidates:
        if not os.path.isfile(f_path):
            continue
        try:
            os.remove(f_path)
        except Exception as err:
            logger.warning("Unable to delete %s due to %s", f_path, err)


def _parse_cli_args():
    """
    Build the command line parser and parse the arguments of the current process.

    :return: argparse.Namespace with `region` (string, defaults to the AWS_REGION environment
             variable) and `cleanup` (boolean flag)
    """
    cli = argparse.ArgumentParser(
        exit_on_error=False,
        description="Get latest User Data from ComputeFleet Launch Templates.",
    )
    cli.add_argument(
        "-r",
        "--region",
        type=str,
        required=False,
        default=os.environ.get("AWS_REGION"),
        help="the cluster AWS region, defaults to AWS_REGION env variable",
    )
    cli.add_argument(
        "-c",
        "--cleanup",
        required=False,
        action="store_true",
        help="Cleanup DNA files created",
    )
    return cli.parse_args()


def main():
    """
    Script entry point.

    With --cleanup the files under COMPUTE_FLEET_SHARED_DNA_LOCATION are removed; otherwise the
    dna.json files of the compute fleet are (re)generated. Any exception is logged with its
    traceback and turned into a successful exit (status 0).
    """
    try:
        cli_args = _parse_cli_args()
        if not cli_args.cleanup:
            share_compute_fleet_dna(cli_args)
        else:
            cleanup(COMPUTE_FLEET_SHARED_DNA_LOCATION)
    except Exception as err:
        # Prefer the exception's `message` attribute when it has one, else the exception itself.
        detail = getattr(err, "message", err)
        logger.exception(
            "Encountered exception when fetching latest dna.json for ComputeFleet, exiting gracefully: %s", detail
        )
        raise SystemExit(0)


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,20 @@ suites:
directory_service:
enabled: "true"
node_type: HeadNode
- name: cfn_hup_configuration
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-tests::test_resource]
verifier:
controls:
- /tag:config_cfn_hup/
attributes:
resource: cfn_hup_configuration:configure
dependencies:
- recipe:aws-parallelcluster-platform::directories
cluster:
node_type: HeadNode
stack_arn: 'test'

# Recipes
- name: cfnconfig_mixed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@
# spack 'Configure Spack Packages' do
# action :configure
# end
include_recipe 'aws-parallelcluster-environment::config_cfn_hup'
cfn_hup_configuration "Configure cfn-hup"

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@

# Add cfn-hup runner
template "#{node['cluster']['scripts_dir']}/cfn-hup-runner.sh" do
source "cfn_bootstrap/cfn-hup-runner.sh.erb"
source "cfn_hup_configuration/cfn-hup-runner.sh.erb"
owner 'root'
group 'root'
mode '0744'
Expand Down
Loading
Loading