Skip to content

Commit 5b49921

Browse files
author
Himani Anil Deshpande
committed
Add cfn-hup configuration resource
* Separate cfn-hup update hook for ComputeFleet * Add `get_compute_user_data.py` script to fetch the LaunchTemplates and parse them to write the relevant DNA files. * Add invocation of script get_compute_user_data.py by the HeadNode during an update * Write dna.json files for each Launch template * Use launch template logical id for update action script * Update cfn-hup hook action script for Compute * Change the owner, group and mode of dna and extra files in tmp * Share extra.json to Compute nodes * Add cleanup operation after an update * Update config_cfn_hup to be streamlined for node-specific configuration files
1 parent 3a6db7c commit 5b49921

File tree

10 files changed

+326
-62
lines changed

10 files changed

+326
-62
lines changed
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License").
4+
# You may not use this file except in compliance with the
5+
# License. A copy of the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
10+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
11+
# limitations under the License.
12+
13+
14+
15+
import argparse
16+
from email import message_from_string
17+
import json
18+
import mimetypes
19+
import os
20+
import boto3
21+
import yaml
22+
import base64
23+
24+
# Root directory under which every path used by this script lives.
SHARED_LOCATION = "/opt/parallelcluster/"

# Per-node-family base directories.
COMPUTE_FLEET_SHARED_LOCATION = f"{SHARED_LOCATION}shared/"
LOGIN_POOL_SHARED_LOCATION = f"{SHARED_LOCATION}shared_login_nodes/"

# Output directories for the generated "<LogicalId>-dna.json" files.
COMPUTE_FLEET_DNA_LOC = f"{COMPUTE_FLEET_SHARED_LOCATION}dna/"
LOGIN_POOL_DNA_LOC = f"{LOGIN_POOL_SHARED_LOCATION}dna/"

# JSON files listing the launch templates to process.
COMPUTE_FLEET_LAUNCH_TEMPLATE_ID = f"{COMPUTE_FLEET_SHARED_LOCATION}launch-templates-config.json"
LOGIN_POOL_LAUNCH_TEMPLATE_ID = f"{LOGIN_POOL_SHARED_LOCATION}launch-templates-config.json"
35+
36+
37+
38+
def get_launch_template_details(shared_storage):
    """Load the launch template configuration stored as JSON in ``shared_storage``.

    :param shared_storage: path of the JSON file to read
    :return: the decoded JSON document
    :raises OSError: if the file cannot be opened
    :raises json.JSONDecodeError: if the file does not contain valid JSON
    """
    # JSON is UTF-8 by specification: decode it explicitly instead of relying on the
    # locale of the process running this script.
    with open(shared_storage, encoding="utf-8") as file:
        return json.load(file)
42+
43+
44+
def get_compute_launch_template_ids(args):
    """Generate the dna file of every compute resource listed in the compute fleet configuration.

    The configuration is read from ``COMPUTE_FLEET_LAUNCH_TEMPLATE_ID``; every entry found under
    ``Queues -> <queue> -> ComputeResources`` is handed to ``get_latest_dns_data`` together with
    ``COMPUTE_FLEET_DNA_LOC`` as output location.

    :param args: parsed command line arguments, forwarded unchanged
    """
    lt_config = get_launch_template_details(COMPUTE_FLEET_LAUNCH_TEMPLATE_ID)
    if not lt_config:
        return

    # A missing or null section is treated as empty instead of failing on ``None.items()``.
    for queue in (lt_config.get('Queues') or {}).values():
        for compute_res in (queue.get('ComputeResources') or {}).values():
            get_latest_dns_data(compute_res, COMPUTE_FLEET_DNA_LOC, args)
52+
53+
54+
def get_login_pool_launch_template_ids(args):
    """Generate the dna file of every login pool listed in the login pool configuration.

    The configuration is read from ``LOGIN_POOL_LAUNCH_TEMPLATE_ID``; every entry found under
    ``LoginPools`` is handed to ``get_latest_dns_data`` together with ``LOGIN_POOL_DNA_LOC`` as
    output location.

    :param args: parsed command line arguments, forwarded unchanged
    """
    lt_config = get_launch_template_details(LOGIN_POOL_LAUNCH_TEMPLATE_ID)
    if not lt_config:
        return

    # A missing or null section is treated as empty instead of failing on ``None.items()``.
    for pool in (lt_config.get('LoginPools') or {}).values():
        get_latest_dns_data(pool, LOGIN_POOL_DNA_LOC, args)
60+
61+
62+
def get_user_data(lt_id, lt_version, region_name):
    """Fetch and decode the UserData of one launch template version.

    :param lt_id: id of the launch template to describe
    :param lt_version: version of the launch template to describe
    :param region_name: region used to create the EC2 client
    :return: the UserData decoded from base64 into text, or ``None`` when it could not be
        retrieved or decoded (the error is printed, not raised)
    """
    try:
        ec2_client = boto3.client("ec2", region_name=region_name)
        versions = ec2_client.describe_launch_template_versions(
            LaunchTemplateId=lt_id,
            Versions=[lt_version],
        ).get('LaunchTemplateVersions')
        encoded_user_data = versions[0]['LaunchTemplateData']['UserData']
        return base64.b64decode(encoded_user_data, validate=True).decode('utf-8')
    except Exception as e:
        # Best effort: API errors, an empty/incomplete response and invalid base64
        # (binascii.Error) all end up here. Say which template failed and make the
        # "no data" result explicit so that callers can test for it.
        print(f"Exception raised while getting user data of launch template {lt_id} version {lt_version}:", e)
        return None
75+
76+
77+
def parse_mime_user_data(user_data):
    """Extract the ``write_files`` entries from the cloud-config part of a MIME user data document.

    :param user_data: the user data as text
    :return: the ``write_files`` list of the ``text/cloud-config`` part (of the last one, if
        several are present); an empty list when there is no such part or it declares no
        ``write_files``
    """
    # Start from an empty result so that a document without any cloud-config part does not
    # end in an UnboundLocalError on return.
    write_directives_section = []
    for part in message_from_string(user_data).walk():
        if part.get_content_type() != 'text/cloud-config':
            continue
        # An empty YAML document loads as None.
        cloud_config = yaml.safe_load(part.get_payload()) or {}
        write_directives_section = cloud_config.get('write_files') or []

    return write_directives_section
84+
85+
86+
def write_dna_files(write_files_section, shared_storage_loc):
    """Write the content of the ``/tmp/dna.json`` entry to ``<shared_storage_loc>-dna.json``.

    :param write_files_section: iterable of cloud-config ``write_files`` entries (dicts with
        ``path`` and ``content``); ``None`` or an empty iterable writes nothing
    :param shared_storage_loc: destination path prefix, ``-dna.json`` is appended to it
    :raises json.JSONDecodeError: if the content of the entry is not valid JSON
    """
    target = shared_storage_loc + "-dna.json"
    staging = target + ".tmp"
    for entry in write_files_section or []:
        if entry.get('path') != '/tmp/dna.json':
            continue
        # Parse before touching the filesystem so that invalid content leaves no file behind.
        dna = json.loads(entry['content'])
        # Write under a temporary name and rename: anything waiting for the final file name
        # to appear must never observe an empty or partially written file.
        with open(staging, "w", encoding="utf-8") as file:
            json.dump(dna, file, indent=4)
        os.replace(staging, target)
91+
92+
93+
def get_latest_dns_data(resource, output_location, args):
    """Write the dna file described by the user data of the launch template of ``resource``.

    The resource is skipped, with a message, when it has no ``LaunchTemplate`` entry or when
    its user data cannot be retrieved.

    :param resource: dict with a ``LaunchTemplate`` entry providing ``Id``, ``Version`` and
        ``LogicalId``
    :param output_location: path prefix of the output file; the ``LogicalId`` is appended to it
    :param args: parsed command line arguments; ``args.region`` selects the region to query
    """
    launch_template = resource.get('LaunchTemplate')
    if not launch_template:
        print(f"No LaunchTemplate found in {resource}, skipping")
        return

    user_data = get_user_data(launch_template.get('Id'), launch_template.get('Version'), args.region)
    if user_data is None:
        # get_user_data returns None on failure; parsing None would raise a TypeError.
        print(f"No user data retrieved for launch template {launch_template.get('Id')}, skipping")
        return

    write_directives = parse_mime_user_data(user_data)
    write_dna_files(write_directives, output_location + launch_template.get("LogicalId"))
97+
98+
def cleanup(directory_loc):
    """Delete every regular file located directly in ``directory_loc``.

    Subdirectories are left untouched. A file that cannot be deleted is reported and skipped.
    A directory that does not exist is treated as already clean.

    :param directory_loc: directory to clean up
    """
    try:
        entries = os.listdir(directory_loc)
    except FileNotFoundError:
        # Nothing to clean up; do not let a missing directory abort the remaining cleanups.
        return

    for entry in entries:
        f_path = os.path.join(directory_loc, entry)
        try:
            if os.path.isfile(f_path):
                os.remove(f_path)
        except OSError as e:
            print(f"Error deleting {f_path}: {e}")
106+
107+
def _parse_cli_args():
    """Parse the command line of the script.

    :return: namespace with ``region`` (string, defaults to the ``AWS_REGION`` environment
        variable) and ``cleanup`` (boolean flag, ``False`` unless ``-c``/``--cleanup`` is given)
    """
    cli = argparse.ArgumentParser(
        exit_on_error=False,
        description="Get latest User Data from Compute and Login Node Launch Templates.",
    )
    cli.add_argument(
        "-r",
        "--region",
        required=False,
        type=str,
        default=os.environ.get("AWS_REGION"),
        help="the cluster AWS region, defaults to AWS_REGION env variable",
    )
    cli.add_argument(
        "-c",
        "--cleanup",
        required=False,
        action="store_true",
        default=False,
        help="Cleanup DNA files created",
    )
    return cli.parse_args()
133+
134+
135+
def main():
    """Run the script: delete the generated dna files with ``--cleanup``, generate them otherwise."""
    args = _parse_cli_args()

    if not args.cleanup:
        get_compute_launch_template_ids(args)
        # Login pool processing is currently disabled:
        # get_login_pool_launch_template_ids(args)
        return

    for dna_dir in (COMPUTE_FLEET_DNA_LOC, LOGIN_POOL_DNA_LOC):
        cleanup(dna_dir)


if __name__ == "__main__":
    main()

cookbooks/aws-parallelcluster-environment/recipes/config.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@
3838
# spack 'Configure Spack Packages' do
3939
# action :configure
4040
# end
41-
include_recipe 'aws-parallelcluster-environment::config_cfn_hup'
41+
cfn_hup_configuration "Configure Cfn-hup"

cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb

Lines changed: 0 additions & 61 deletions
This file was deleted.
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# frozen_string_literal: true
2+
3+
#
4+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
7+
# License. A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
12+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Custom resource used in recipes as `cfn_hup_configuration "<name>"`.
provides :cfn_hup_configuration
unified_mode true
# A resource declared without an explicit action runs :configure.
default_action :configure
18+
19+
# Creates /etc/cfn and /etc/cfn/hooks.d, renders cfn-hup.conf, runs the node-type specific
# :extra_configuration action and finally renders the pcluster-update hook.
action :configure do
  cloudformation_url = "https://cloudformation.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}"
  instance_role_name =
    if on_docker?
      # IMDS is not available on Docker
      "FAKE_INSTANCE_ROLE_NAME"
    else
      get_metadata_with_token(get_metadata_token, URI("http://169.254.169.254/latest/meta-data/iam/security-credentials"))
    end

  # Template variables needed by both configuration files.
  common_variables = {
    stack_id: node['cluster']['stack_arn'],
    region: node['cluster']['region'],
    cloudformation_url: cloudformation_url,
    cfn_init_role: instance_role_name,
  }

  ['/etc/cfn', '/etc/cfn/hooks.d'].each do |cfn_dir|
    directory cfn_dir do
      owner 'root'
      group 'root'
      mode '0770'
      recursive true
    end
  end

  template '/etc/cfn/cfn-hup.conf' do
    source 'cfn_hup/cfn-hup.conf.erb'
    owner 'root'
    group 'root'
    mode '0400'
    variables(common_variables)
  end

  action_extra_configuration

  template '/etc/cfn/hooks.d/pcluster-update.conf' do
    source "cfn_hup/cfn-hook-update.conf.erb"
    owner 'root'
    group 'root'
    mode '0400'
    variables(
      common_variables.merge(
        launch_template_resource_id: node['cluster']['launch_template_id'],
        update_hook_script_dir: node['cluster']['scripts_dir']
      )
    )
  end
end
71+
72+
# Node-type specific part of the cfn-hup setup:
# - HeadNode: installs get_compute_user_data.py and creates the dna directory;
# - ComputeFleet: renders the script executed by the cfn-hup update hook.
# Other node types need nothing extra.
action :extra_configuration do
  case node['cluster']['node_type']
  when 'HeadNode'
    cookbook_file "#{node['cluster']['scripts_dir']}/get_compute_user_data.py" do
      source 'cfn_hup/get_compute_user_data.py'
      owner 'root'
      group 'root'
      mode '0755'
      # NOTE(review): :create_if_missing never refreshes an existing copy of the script;
      # confirm this is intended.
      action :create_if_missing
    end

    # Same helper as the one handed to the compute fleet template below, so the directory
    # created here and the directory monitored by the compute nodes cannot drift apart.
    directory monitor_shared_dir

  when 'ComputeFleet'
    template "#{node['cluster']['scripts_dir']}/cfn-hup-update-action.sh" do
      source "cfn_hup/#{node['cluster']['node_type']}/cfn-hup-update-action.sh.erb"
      owner 'root'
      group 'root'
      mode '0744' # TODO: Change permission
      variables(
        monitor_shared_dir: monitor_shared_dir,
        launch_template_resource_id: node['cluster']['launch_template_id']
      )
    end
  end
end
98+
99+
action_class do
  # Path of the "dna" directory located under the cluster shared directory.
  def monitor_shared_dir
    "#{node['cluster']['shared_dir']}/dna"
  end
end
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/bin/bash
2+
set -ex
3+
4+
5+
6+
7+
# Waits until the dna file of this launch template is available in the monitored directory,
# installs it together with extra.json under /tmp, merges both into /etc/chef/dna.json and
# runs the update recipes followed by the post-update custom action.
function run_cookbook_recipes() {
  LATEST_DNA_LOC=<%= @monitor_shared_dir %>
  LATEST_DNA_FILE=$LATEST_DNA_LOC/<%= @launch_template_resource_id %>-dna.json
  LATEST_EXTRA_FILE=$LATEST_DNA_LOC/extra.json

  # Poll once a minute, and only while the file is missing: no extra wait after the update ran.
  until [[ -f "$LATEST_DNA_FILE" ]]; do
    sleep 60
  done

  cp "$LATEST_DNA_FILE" /tmp/dna.json
  chown root:root /tmp/dna.json
  chmod 000644 /tmp/dna.json
  cp "$LATEST_EXTRA_FILE" /tmp/extra.json
  chown root:root /tmp/extra.json
  chmod 000644 /tmp/extra.json
  mkdir -p /etc/chef/ohai/hints
  touch /etc/chef/ohai/hints/ec2.json
  # Deep-merge extra.json on top of dna.json; fall back to the plain dna.json if jq fails.
  jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )
  cd /etc/chef
  # Two separate commands instead of "a && b": with "set -e" a failure on the left side of
  # "&&" does not stop the script, which would hide a failed cinc-client run.
  cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::update
  /opt/parallelcluster/scripts/fetch_and_run -postupdate
}
33+
34+
35+
# Entry point: set the PATH, load the cookbook environment, then wait for and apply the update.
main() {
  PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin
  source /etc/parallelcluster/pcluster_cookbook_environment.sh
  echo "We monitor <%= @monitor_shared_dir %> to check for <%= @launch_template_resource_id %>-dna.json is being added"
  run_cookbook_recipes
}

main "$@"
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
<%# cfn-hup hook run on post.update: HeadNode and LoginNode run cfn-init with the update configset, ComputeFleet runs cfn-hup-update-action.sh with a 900 second timeout. -%>
[parallelcluster-update]
triggers=post.update
<% case node['cluster']['node_type'] -%>
<% when 'HeadNode', 'LoginNode' -%>
path=Resources.<%= @launch_template_resource_id %>.Metadata.AWS::CloudFormation::Init
action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; . /etc/parallelcluster/pcluster_cookbook_environment.sh; $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -v --stack <%= @stack_id %> --resource <%= @launch_template_resource_id %> --configsets update --region <%= @region %> --url <%= @cloudformation_url %> --role <%= @cfn_init_role %>
<% when 'ComputeFleet' -%>
path=Resources.<%= @launch_template_resource_id %>
action=timeout 900 <%= @update_hook_script_dir %>/cfn-hup-update-action.sh
<% end %>
runas=root

0 commit comments

Comments
 (0)