From 7857bd85cfba218efdd53c61a9eb7a734438302c Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 3 Feb 2025 13:38:54 -0500 Subject: [PATCH 01/12] Add cfn-hup configuration resource * Separate cfn-hup update hook for ComputeFleet * Add `get_compute_user_data.py` script to parse and get LaunchTemplates and parse them to write relevant DNA files. * Add invocation of script get_compute_user_data.py by headNode during an update * Writing dna.json files for each Launch template * Using launch template logical id for update action script * Update cfn-hup hook action script for Compute * Change the owner, group and mode of dna and extra files in tmp * Share extra.json to Compute nodes * adding cleanup operation after an update * Update config_cfn_hup to be streamlined for node-specific configuration files --- .../files/cfn_hup/get_compute_user_data.py | 146 ++++++++++++++++++ .../recipes/config.rb | 2 +- .../recipes/config/config_cfn_hup.rb | 61 -------- .../resources/config_cfn_hup.rb | 103 ++++++++++++ .../ComputeFleet/cfn-hup-update-action.sh.erb | 43 ++++++ .../cfn-hook-update.conf.erb | 6 + .../cfn-hup-runner.sh.erb | 0 .../cfn-hup.conf.erb | 0 .../resources/fetch_config.rb | 19 +++ .../recipes/update/update_head_node.rb | 8 + 10 files changed, 326 insertions(+), 62 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-environment/files/cfn_hup/get_compute_user_data.py delete mode 100644 cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb create mode 100644 cookbooks/aws-parallelcluster-environment/resources/config_cfn_hup.rb create mode 100644 cookbooks/aws-parallelcluster-environment/templates/cfn_hup/ComputeFleet/cfn-hup-update-action.sh.erb rename cookbooks/aws-parallelcluster-environment/templates/{cfn_bootstrap => cfn_hup}/cfn-hook-update.conf.erb (66%) rename cookbooks/aws-parallelcluster-environment/templates/{cfn_bootstrap => cfn_hup}/cfn-hup-runner.sh.erb (100%) rename 
cookbooks/aws-parallelcluster-environment/templates/{cfn_bootstrap => cfn_hup}/cfn-hup.conf.erb (100%) diff --git a/cookbooks/aws-parallelcluster-environment/files/cfn_hup/get_compute_user_data.py b/cookbooks/aws-parallelcluster-environment/files/cfn_hup/get_compute_user_data.py new file mode 100644 index 0000000000..7a116d1098 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/files/cfn_hup/get_compute_user_data.py @@ -0,0 +1,146 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ + + +import argparse +from email import message_from_string +import json +import mimetypes +import os +import boto3 +import yaml +import base64 + +SHARED_LOCATION = "/opt/parallelcluster/" + +COMPUTE_FLEET_SHARED_LOCATION = SHARED_LOCATION + 'shared/' +LOGIN_POOL_SHARED_LOCATION = SHARED_LOCATION + 'shared_login_nodes/' + +COMPUTE_FLEET_DNA_LOC = COMPUTE_FLEET_SHARED_LOCATION + 'dna/' +LOGIN_POOL_DNA_LOC = LOGIN_POOL_SHARED_LOCATION + 'dna/' + +COMPUTE_FLEET_LAUNCH_TEMPLATE_ID = COMPUTE_FLEET_SHARED_LOCATION + 'launch-templates-config.json' + +LOGIN_POOL_LAUNCH_TEMPLATE_ID = LOGIN_POOL_SHARED_LOCATION + 'launch-templates-config.json' + + + +def get_launch_template_details(shared_storage): + with open(shared_storage, 'r') as file: + lt_config = json.loads(file.read()) + return lt_config + + +def get_compute_launch_template_ids(args): + lt_config = get_launch_template_details(COMPUTE_FLEET_LAUNCH_TEMPLATE_ID) + if lt_config: + all_queues = lt_config.get('Queues') + for _, queues in all_queues.items(): + compute_resources = queues.get('ComputeResources') + for _, compute_res in compute_resources.items(): + get_latest_dns_data(compute_res, COMPUTE_FLEET_DNA_LOC, args) + + +def get_login_pool_launch_template_ids(args): + lt_config = get_launch_template_details(LOGIN_POOL_LAUNCH_TEMPLATE_ID) + if lt_config: + login_pools = lt_config.get('LoginPools') + for _, pool in login_pools.items(): + get_latest_dns_data(pool, LOGIN_POOL_DNA_LOC, args) + + +def get_user_data(lt_id, lt_version, region_name): + try: + ec2_client = boto3.client("ec2", region_name=region_name) + response = ec2_client.describe_launch_template_versions( + LaunchTemplateId= lt_id, + Versions=[ + lt_version, + ], + ).get('LaunchTemplateVersions') + decoded_data = base64.b64decode(response[0]['LaunchTemplateData']['UserData'], validate=True).decode('utf-8') + return decoded_data + except Exception as e: # binascii.Error: + print("Exception raised", e) + + +def parse_mime_user_data(user_data): + data = 
message_from_string(user_data) + for cloud_config_section in data.walk(): + if cloud_config_section.get_content_type() == 'text/cloud-config': + write_directives_section = yaml.safe_load(cloud_config_section._payload).get('write_files') + + return write_directives_section + + +def write_dna_files(write_files_section, shared_storage_loc): + for data in write_files_section: + if data['path'] in ['/tmp/dna.json']: + with open(shared_storage_loc+"-dna.json" ,"w") as file: + file.write(json.dumps(json.loads(data['content']),indent=4)) + + +def get_latest_dns_data(resource, output_location, args): + user_data = get_user_data(resource.get('LaunchTemplate').get('Id'), resource.get('LaunchTemplate').get('Version'), args.region) + write_directives = parse_mime_user_data(user_data) + write_dna_files(write_directives, output_location+resource.get('LaunchTemplate').get("LogicalId")) + +def cleanup(directory_loc): + for f in os.listdir(directory_loc): + f_path = os.path.join(directory_loc, f) + try: + if os.path.isfile(f_path): + os.remove(f_path) + except Exception as e: + print(f"Error deleting {f_path}: {e}") + +def _parse_cli_args(): + parser = argparse.ArgumentParser( + description="Get latest User Data from Compute and Login Node Launch Templates.", exit_on_error=False + ) + + parser.add_argument( + "-r", + "--region", + type=str, + default=os.getenv("AWS_REGION", None), + required=False, + help="the cluster AWS region, defaults to AWS_REGION env variable", + ) + + parser.add_argument( + "-c", + "--cleanup", + action="store_true", + default=False, + required=False, + help="Cleanup DNA files created", + ) + + args = parser.parse_args() + + return args + + +def main(): + args = _parse_cli_args() + if args.cleanup: + cleanup(COMPUTE_FLEET_DNA_LOC) + cleanup(LOGIN_POOL_DNA_LOC) + else: + get_compute_launch_template_ids(args) + #get_login_pool_launch_template_ids(args) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git 
a/cookbooks/aws-parallelcluster-environment/recipes/config.rb b/cookbooks/aws-parallelcluster-environment/recipes/config.rb index b0c0057e40..1326bca930 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config.rb @@ -38,4 +38,4 @@ # spack 'Configure Spack Packages' do # action :configure # end -include_recipe 'aws-parallelcluster-environment::config_cfn_hup' +cfn_hup_configuration "Configure Cfn-hup" diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb deleted file mode 100644 index 7a82d57773..0000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/config_cfn_hup.rb +++ /dev/null @@ -1,61 +0,0 @@ -# frozen_string_literal: true - -# -# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. - -cloudformation_url = "https://cloudformation.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}" -instance_role_name = lambda { - # IMDS is not available on Docker - return "FAKE_INSTANCE_ROLE_NAME" if on_docker? 
- get_metadata_with_token(get_metadata_token, URI("http://169.254.169.254/latest/meta-data/iam/security-credentials")) -}.call - -directory '/etc/cfn' do - owner 'root' - group 'root' - mode '0770' - recursive true -end - -directory '/etc/cfn/hooks.d' do - owner 'root' - group 'root' - mode '0770' - recursive true -end - -template '/etc/cfn/cfn-hup.conf' do - source 'cfn_bootstrap/cfn-hup.conf.erb' - owner 'root' - group 'root' - mode '0400' - variables( - stack_id: node['cluster']['stack_arn'], - region: node['cluster']['region'], - cloudformation_url: cloudformation_url, - cfn_init_role: instance_role_name - ) -end - -template '/etc/cfn/hooks.d/pcluster-update.conf' do - source 'cfn_bootstrap/cfn-hook-update.conf.erb' - owner 'root' - group 'root' - mode '0400' - variables( - stack_id: node['cluster']['stack_arn'], - region: node['cluster']['region'], - cloudformation_url: cloudformation_url, - cfn_init_role: instance_role_name, - launch_template_resource_id: node['cluster']['launch_template_id'] - ) -end diff --git a/cookbooks/aws-parallelcluster-environment/resources/config_cfn_hup.rb b/cookbooks/aws-parallelcluster-environment/resources/config_cfn_hup.rb new file mode 100644 index 0000000000..e09daab6e4 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/resources/config_cfn_hup.rb @@ -0,0 +1,103 @@ +# frozen_string_literal: true + +# +# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +provides :cfn_hup_configuration +unified_mode true +default_action :configure + +action :configure do + cloudformation_url = "https://cloudformation.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}" + instance_role_name = lambda { + # IMDS is not available on Docker + return "FAKE_INSTANCE_ROLE_NAME" if on_docker? + get_metadata_with_token(get_metadata_token, URI("http://169.254.169.254/latest/meta-data/iam/security-credentials")) + }.call + + directory '/etc/cfn' do + owner 'root' + group 'root' + mode '0770' + recursive true + end + + directory '/etc/cfn/hooks.d' do + owner 'root' + group 'root' + mode '0770' + recursive true + end + + template '/etc/cfn/cfn-hup.conf' do + source 'cfn_hup/cfn-hup.conf.erb' + owner 'root' + group 'root' + mode '0400' + variables( + stack_id: node['cluster']['stack_arn'], + region: node['cluster']['region'], + cloudformation_url: cloudformation_url, + cfn_init_role: instance_role_name + ) + end + + action_extra_configuration + + template '/etc/cfn/hooks.d/pcluster-update.conf' do + source "cfn_hup/cfn-hook-update.conf.erb" + owner 'root' + group 'root' + mode '0400' + variables( + stack_id: node['cluster']['stack_arn'], + region: node['cluster']['region'], + cloudformation_url: cloudformation_url, + cfn_init_role: instance_role_name, + launch_template_resource_id: node['cluster']['launch_template_id'], + update_hook_script_dir: node['cluster']['scripts_dir'] + ) + end +end + +action :extra_configuration do + case node['cluster']['node_type'] + when 'HeadNode' + cookbook_file "#{node['cluster']['scripts_dir']}/get_compute_user_data.py" do + source 'cfn_hup/get_compute_user_data.py' + owner 'root' + group 'root' + mode '0755' + action :create_if_missing + end + + directory "#{node['cluster']['shared_dir']}/dna" + + when 'ComputeFleet' + template "#{node['cluster']['scripts_dir']}/cfn-hup-update-action.sh" do + source "cfn_hup/#{node['cluster']['node_type']}/cfn-hup-update-action.sh.erb" + owner 'root' + group 'root' + 
mode '0744' # TODO: Change permission + variables( + monitor_shared_dir: monitor_shared_dir, + launch_template_resource_id: node['cluster']['launch_template_id'] + ) + end + end +end + +action_class do + def monitor_shared_dir + "#{node['cluster']['shared_dir']}/dna" + end +end diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/ComputeFleet/cfn-hup-update-action.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/ComputeFleet/cfn-hup-update-action.sh.erb new file mode 100644 index 0000000000..b16424dbca --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/ComputeFleet/cfn-hup-update-action.sh.erb @@ -0,0 +1,43 @@ +#!/bin/bash +set -ex + + + + +function run_cookbook_recipes() { + LATEST_DNA_LOC=<%= @monitor_shared_dir %> + LATEST_DNA_FILE=$LATEST_DNA_LOC/<%= @launch_template_resource_id %>-dna.json + LATEST_EXTRA_FILE=$LATEST_DNA_LOC/extra.json + + GET_DNA_FILE=true + while $GET_DNA_FILE; do + if [[ -f $LATEST_DNA_FILE ]]; then + GET_DNA_FILE=false + cp $LATEST_DNA_FILE /tmp/dna.json + chown root:root /tmp/dna.json + chmod 000644 /tmp/dna.json + cp $LATEST_EXTRA_FILE /tmp/extra.json + chown root:root /tmp/extra.json + chmod 000644 /tmp/extra.json + mkdir -p /etc/chef/ohai/hints + touch /etc/chef/ohai/hints/ec2.json + jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json ) + cd /etc/chef + cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::update && /opt/parallelcluster/scripts/fetch_and_run -postupdate + + fi + + sleep 60 + done +} + + +main() { + PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; + . 
/etc/parallelcluster/pcluster_cookbook_environment.sh; + echo "We monitor <%= @monitor_shared_dir %> to check for <%= @launch_template_resource_id %>-dna.json is being added" + run_cookbook_recipes +} + + +main "$@" diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hook-update.conf.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hook-update.conf.erb similarity index 66% rename from cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hook-update.conf.erb rename to cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hook-update.conf.erb index 075895042f..ea18afa144 100644 --- a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hook-update.conf.erb +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hook-update.conf.erb @@ -1,5 +1,11 @@ [parallelcluster-update] triggers=post.update +<% case node['cluster']['node_type'] -%> +<% when 'HeadNode', 'LoginNode' -%> path=Resources.<%= @launch_template_resource_id %>.Metadata.AWS::CloudFormation::Init action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; . 
/etc/parallelcluster/pcluster_cookbook_environment.sh; $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -v --stack <%= @stack_id %> --resource <%= @launch_template_resource_id %> --configsets update --region <%= @region %> --url <%= @cloudformation_url %> --role <%= @cfn_init_role %> +<% when 'ComputeFleet' -%> +path=Resources.<%= @launch_template_resource_id %> +action=timeout 900 <%= @update_hook_script_dir %>/cfn-hup-update-action.sh +<% end %> runas=root diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-runner.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hup-runner.sh.erb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-runner.sh.erb rename to cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hup-runner.sh.erb diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup.conf.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hup.conf.erb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup.conf.erb rename to cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hup.conf.erb diff --git a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb index 0f2b492ad0..c4b674d8c4 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb @@ -9,6 +9,23 @@ default_action :run + +action :share_dna_files do + return if on_docker? 
+ + Chef::Log.info("Share extra.json with all nodes") + ::FileUtils.cp_r('/tmp/extra.json', "#{node['cluster']['shared_dir']}/dna/extra.json",remove_destination: true) + ::FileUtils.cp_r('/tmp/extra.json', "#{node['cluster']['shared_dir_login_nodes']}/dna/extra.json",remove_destination: true) + + execute "Share DNA files" do + command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + " --region #{node['cluster']['region']}" + timeout 30 + retries 10 + retry_delay 90 + end +end + action :run do return if on_docker? Chef::Log.debug("Called fetch_config with update (#{new_resource.update})") @@ -19,6 +36,8 @@ case node['cluster']['node_type'] when 'HeadNode' if new_resource.update + action_share_dna_files + Chef::Log.info("Backing up old configuration from (#{node['cluster']['cluster_config_path']}) to (#{node['cluster']['previous_cluster_config_path']})") ::FileUtils.cp_r(node['cluster']['cluster_config_path'], node['cluster']['previous_cluster_config_path'], remove_destination: true) fetch_cluster_config(node['cluster']['cluster_config_path']) diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb index 9358dd0e78..f212c23260 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb @@ -281,3 +281,11 @@ def update_nodes_in_queue(strategy, queues) cookbook 'aws-parallelcluster-environment' mode '0644' end + +execute "Cleanup DNA files" do + command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + " --region #{node['cluster']['region']} --cleanup" + timeout 30 + retries 10 + retry_delay 90 +end \ No newline at end of file From 988e583961e9817133f4f1e8b8da1a4bc00b14a4 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 6 Feb 2025 13:30:05 -0500 
Subject: [PATCH 02/12] [UnitTest] Add unit test for cfn_hup_configuration resource --- ...ig_cfn_hup.rb => cfn_hup_configuration.rb} | 0 .../unit/resources/config_cfn_hup_spec.rb | 116 ++++++++++++++++++ 2 files changed, 116 insertions(+) rename cookbooks/aws-parallelcluster-environment/resources/{config_cfn_hup.rb => cfn_hup_configuration.rb} (100%) create mode 100644 cookbooks/aws-parallelcluster-environment/spec/unit/resources/config_cfn_hup_spec.rb diff --git a/cookbooks/aws-parallelcluster-environment/resources/config_cfn_hup.rb b/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/resources/config_cfn_hup.rb rename to cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/config_cfn_hup_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/config_cfn_hup_spec.rb new file mode 100644 index 0000000000..a1613c7fb8 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/config_cfn_hup_spec.rb @@ -0,0 +1,116 @@ +require 'spec_helper' + +class ConvergeCfnHupConfiguration + def self.configure(chef_run) + chef_run.converge_dsl('aws-parallelcluster-environment') do + cfn_hup_configuration 'configure' do + action :configure + end + end + end +end + +AWS_REGION = "AWS_REGION".freeze +AWS_DOMAIN = "AWS_DOMAIN".freeze +STACK_ID = "STACK_ID".freeze +CLOUDFORMATION_URL = "https://cloudformation.#{AWS_REGION}.#{AWS_DOMAIN}".freeze +INSTANCE_ROLE_NAME = "INSTANCE_ROLE_NAME".freeze +LAUNCH_TEMPLATE_ID = "LAUNCH_TEMPLATE_ID".freeze +SCRIPT_DIR = "SCRIPT_DIR".freeze +MONITOR_SHARED_DIR = "MONITOR_SHARED_DIR".freeze + +describe 'cfn_hup_configuration:configure' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + for_all_node_types do |node_type| + context "when #{node_type}" do + cached(:chef_run) do + runner = 
runner(platform: platform, version: version, step_into: ['cfn_hup_configuration']) do |node| + allow_any_instance_of(Object).to receive(:get_metadata_token).and_return("IMDS_TOKEN") + allow_any_instance_of(Object).to receive(:get_metadata_with_token) + .with("IMDS_TOKEN", URI("http://169.254.169.254/latest/meta-data/iam/security-credentials")) + .and_return(INSTANCE_ROLE_NAME) + node.override["cluster"]["node_type"] = node_type + node.override["cluster"]["region"] = AWS_REGION + node.override["cluster"]["aws_domain"] = AWS_DOMAIN + # TODO: We inject the stack id into the attribute stack_arn when generating the dna.json in the CLI. + # This should be fixed at the CLI level first and adapt the cookbook accordingly. + node.override["cluster"]["stack_arn"] = STACK_ID + node.override["cluster"]["launch_template_id"] = LAUNCH_TEMPLATE_ID + node.override['cluster']['scripts_dir'] = SCRIPT_DIR + node.override['cluster']['shared_dir'] = MONITOR_SHARED_DIR + end + ConvergeCfnHupConfiguration.configure(runner) + end + cached(:node) { chef_run.node } + + %w(/etc/cfn /etc/cfn/hooks.d).each do |dir| + it "creates the directory #{dir}" do + is_expected.to create_directory(dir) + .with(owner: 'root') + .with(group: 'root') + .with(mode: "0770") + .with(recursive: true) + end + end + + it "creates the file /etc/cfn/cfn-hup.conf" do + is_expected.to create_template("/etc/cfn/cfn-hup.conf") + .with(source: 'cfn_hup/cfn-hup.conf.erb') + .with(user: "root") + .with(group: "root") + .with(mode: "0400") + .with(variables: { + stack_id: STACK_ID, + region: AWS_REGION, + cloudformation_url: CLOUDFORMATION_URL, + cfn_init_role: INSTANCE_ROLE_NAME, + }) + end + + it "creates the file /etc/cfn/hooks.d/pcluster-update.conf" do + is_expected.to create_template("/etc/cfn/hooks.d/pcluster-update.conf") + .with(source: 'cfn_hup/cfn-hook-update.conf.erb') + .with(user: "root") + .with(group: "root") + .with(mode: "0400") + .with(variables: { + stack_id: STACK_ID, + region: AWS_REGION, + 
cloudformation_url: CLOUDFORMATION_URL, + cfn_init_role: INSTANCE_ROLE_NAME, + launch_template_resource_id: LAUNCH_TEMPLATE_ID, + update_hook_script_dir: SCRIPT_DIR, + }) + end + + if %(ComputeFleet).include?(node_type) + it "creates the file #{SCRIPT_DIR}/cfn-hup-update-action.sh" do + is_expected.to create_template("#{SCRIPT_DIR}/cfn-hup-update-action.sh") + .with(source: "cfn_hup/#{node_type}/cfn-hup-update-action.sh.erb") + .with(user: "root") + .with(group: "root") + .with(mode: "0744") + .with(variables: { + monitor_shared_dir: "#{MONITOR_SHARED_DIR}/dna", + launch_template_resource_id: LAUNCH_TEMPLATE_ID, + }) + end + elsif node_type == 'HeadNode' + it "creates #{SCRIPT_DIR}/get_compute_user_data.py" do + is_expected.to create_if_missing_cookbook_file("#{SCRIPT_DIR}/get_compute_user_data.py") + .with(source: 'cfn_hup/get_compute_user_data.py') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + end + + it "creates the directory #{MONITOR_SHARED_DIR}/dna" do + is_expected.to create_directory("#{MONITOR_SHARED_DIR}/dna") + end + end + end + end + end + end +end From f44b38c05add2eb759585b38252d181b2a5b7b4b Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 6 Feb 2025 13:40:03 -0500 Subject: [PATCH 03/12] Adding fetch_dna_files resource for HeadNode invocation of sharing and cleaning up dna.json and extra.json during an update * Renaming the files and folders to cfn_hup_configuration * Deleting old recipie config_cfn_hup_spec.rb --- .../get_compute_user_data.py | 0 .../resources/cfn_hup_configuration.rb | 8 +- .../spec/unit/recipes/config_cfn_hup_spec.rb | 90 ------------------- ..._spec.rb => cfn_hup_configuration_spec.rb} | 8 +- .../ComputeFleet/cfn-hup-update-action.sh.erb | 0 .../cfn-hook-update.conf.erb | 0 .../cfn-hup-runner.sh.erb | 0 .../cfn-hup.conf.erb | 0 .../recipes/update.rb | 1 + .../resources/fetch_dna_files.rb | 37 ++++++++ cookbooks/aws-parallelcluster-slurm/Berksfile | 1 + .../aws-parallelcluster-slurm/metadata.rb 
| 1 + .../recipes/update/update_head_node.rb | 8 +- 13 files changed, 50 insertions(+), 104 deletions(-) rename cookbooks/aws-parallelcluster-environment/files/{cfn_hup => cfn_hup_configuration}/get_compute_user_data.py (100%) delete mode 100644 cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_cfn_hup_spec.rb rename cookbooks/aws-parallelcluster-environment/spec/unit/resources/{config_cfn_hup_spec.rb => cfn_hup_configuration_spec.rb} (93%) rename cookbooks/aws-parallelcluster-environment/templates/{cfn_hup => cfn_hup_configuration}/ComputeFleet/cfn-hup-update-action.sh.erb (100%) rename cookbooks/aws-parallelcluster-environment/templates/{cfn_hup => cfn_hup_configuration}/cfn-hook-update.conf.erb (100%) rename cookbooks/aws-parallelcluster-environment/templates/{cfn_hup => cfn_hup_configuration}/cfn-hup-runner.sh.erb (100%) rename cookbooks/aws-parallelcluster-environment/templates/{cfn_hup => cfn_hup_configuration}/cfn-hup.conf.erb (100%) create mode 100644 cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb diff --git a/cookbooks/aws-parallelcluster-environment/files/cfn_hup/get_compute_user_data.py b/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py similarity index 100% rename from cookbooks/aws-parallelcluster-environment/files/cfn_hup/get_compute_user_data.py rename to cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py diff --git a/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb b/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb index e09daab6e4..e09d6e87fe 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb @@ -39,7 +39,7 @@ end template '/etc/cfn/cfn-hup.conf' do - source 'cfn_hup/cfn-hup.conf.erb' + source 'cfn_hup_configuration/cfn-hup.conf.erb' owner 'root' 
group 'root' mode '0400' @@ -54,7 +54,7 @@ action_extra_configuration template '/etc/cfn/hooks.d/pcluster-update.conf' do - source "cfn_hup/cfn-hook-update.conf.erb" + source "cfn_hup_configuration/cfn-hook-update.conf.erb" owner 'root' group 'root' mode '0400' @@ -73,7 +73,7 @@ case node['cluster']['node_type'] when 'HeadNode' cookbook_file "#{node['cluster']['scripts_dir']}/get_compute_user_data.py" do - source 'cfn_hup/get_compute_user_data.py' + source 'cfn_hup_configuration/get_compute_user_data.py' owner 'root' group 'root' mode '0755' @@ -84,7 +84,7 @@ when 'ComputeFleet' template "#{node['cluster']['scripts_dir']}/cfn-hup-update-action.sh" do - source "cfn_hup/#{node['cluster']['node_type']}/cfn-hup-update-action.sh.erb" + source "cfn_hup_configuration/#{node['cluster']['node_type']}/cfn-hup-update-action.sh.erb" owner 'root' group 'root' mode '0744' # TODO: Change permission diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_cfn_hup_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_cfn_hup_spec.rb deleted file mode 100644 index 0e3c372a58..0000000000 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_cfn_hup_spec.rb +++ /dev/null @@ -1,90 +0,0 @@ -# frozen_string_literal: true - -# Copyright:: 2024 Amazon.com, Inc. and its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. 
- -require "spec_helper" - -describe "aws-parallelcluster-environment::config_cfn_hup" do - AWS_REGION = "AWS_REGION" - AWS_DOMAIN = "AWS_DOMAIN" - STACK_ID = "STACK_ID" - CLOUDFORMATION_URL = "https://cloudformation.#{AWS_REGION}.#{AWS_DOMAIN}" - INSTANCE_ROLE_NAME = "INSTANCE_ROLE_NAME" - LAUNCH_TEMPLATE_ID = "LAUNCH_TEMPLATE_ID" - - for_all_oses do |platform, version| - context "on #{platform}#{version}" do - for_all_node_types do |node_type| - context "when #{node_type}" do - cached(:chef_run) do - runner = runner(platform: platform, version: version) do |node| - allow_any_instance_of(Object).to receive(:get_metadata_token).and_return("IMDS_TOKEN") - allow_any_instance_of(Object).to receive(:get_metadata_with_token) - .with("IMDS_TOKEN", URI("http://169.254.169.254/latest/meta-data/iam/security-credentials")) - .and_return(INSTANCE_ROLE_NAME) - - node.override["cluster"]["node_type"] = node_type - node.override["cluster"]["region"] = AWS_REGION - node.override["cluster"]["aws_domain"] = AWS_DOMAIN - # TODO: We inject the stack id into the attribute stack_arn when generating the dna.json in the CLI. - # This should be fixed at the CLI level first and adapt the cookbook accordingly. 
- node.override["cluster"]["stack_arn"] = STACK_ID - node.override["cluster"]["launch_template_id"] = LAUNCH_TEMPLATE_ID - end - runner.converge(described_recipe) - end - cached(:node) { chef_run.node } - - %w(/etc/cfn /etc/cfn/hooks.d).each do |dir| - it "creates the directory #{dir}" do - is_expected.to create_directory(dir).with( - owner: "root", - group: "root", - mode: "0770", - recursive: true - ) - end - end - - it "creates the file /etc/cfn/cfn-hup.conf" do - is_expected.to create_template("/etc/cfn/cfn-hup.conf") - .with(source: 'cfn_bootstrap/cfn-hup.conf.erb') - .with(user: "root") - .with(group: "root") - .with(mode: "0400") - .with(variables: { - stack_id: STACK_ID, - region: AWS_REGION, - cloudformation_url: CLOUDFORMATION_URL, - cfn_init_role: INSTANCE_ROLE_NAME, - }) - end - - it "creates the file /etc/cfn/hooks.d/pcluster-update.conf" do - is_expected.to create_template("/etc/cfn/hooks.d/pcluster-update.conf") - .with(source: 'cfn_bootstrap/cfn-hook-update.conf.erb') - .with(user: "root") - .with(group: "root") - .with(mode: "0400") - .with(variables: { - stack_id: STACK_ID, - region: AWS_REGION, - cloudformation_url: CLOUDFORMATION_URL, - cfn_init_role: INSTANCE_ROLE_NAME, - launch_template_resource_id: LAUNCH_TEMPLATE_ID, - }) - end - end - end - end - end -end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/config_cfn_hup_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb similarity index 93% rename from cookbooks/aws-parallelcluster-environment/spec/unit/resources/config_cfn_hup_spec.rb rename to cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb index a1613c7fb8..86477ca056 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/config_cfn_hup_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb @@ -56,7 +56,7 @@ def self.configure(chef_run) it "creates 
the file /etc/cfn/cfn-hup.conf" do is_expected.to create_template("/etc/cfn/cfn-hup.conf") - .with(source: 'cfn_hup/cfn-hup.conf.erb') + .with(source: 'cfn_hup_configuration/cfn-hup.conf.erb') .with(user: "root") .with(group: "root") .with(mode: "0400") @@ -70,7 +70,7 @@ def self.configure(chef_run) it "creates the file /etc/cfn/hooks.d/pcluster-update.conf" do is_expected.to create_template("/etc/cfn/hooks.d/pcluster-update.conf") - .with(source: 'cfn_hup/cfn-hook-update.conf.erb') + .with(source: 'cfn_hup_configuration/cfn-hook-update.conf.erb') .with(user: "root") .with(group: "root") .with(mode: "0400") @@ -87,7 +87,7 @@ def self.configure(chef_run) if %(ComputeFleet).include?(node_type) it "creates the file #{SCRIPT_DIR}/cfn-hup-update-action.sh" do is_expected.to create_template("#{SCRIPT_DIR}/cfn-hup-update-action.sh") - .with(source: "cfn_hup/#{node_type}/cfn-hup-update-action.sh.erb") + .with(source: "cfn_hup_configuration/#{node_type}/cfn-hup-update-action.sh.erb") .with(user: "root") .with(group: "root") .with(mode: "0744") @@ -99,7 +99,7 @@ def self.configure(chef_run) elsif node_type == 'HeadNode' it "creates #{SCRIPT_DIR}/get_compute_user_data.py" do is_expected.to create_if_missing_cookbook_file("#{SCRIPT_DIR}/get_compute_user_data.py") - .with(source: 'cfn_hup/get_compute_user_data.py') + .with(source: 'cfn_hup_configuration/get_compute_user_data.py') .with(user: 'root') .with(group: 'root') .with(mode: '0755') diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/ComputeFleet/cfn-hup-update-action.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/ComputeFleet/cfn-hup-update-action.sh.erb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/templates/cfn_hup/ComputeFleet/cfn-hup-update-action.sh.erb rename to cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/ComputeFleet/cfn-hup-update-action.sh.erb diff --git 
a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hook-update.conf.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hook-update.conf.erb rename to cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hup-runner.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hup-runner.sh.erb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hup-runner.sh.erb rename to cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hup-runner.sh.erb diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hup.conf.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hup.conf.erb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/templates/cfn_hup/cfn-hup.conf.erb rename to cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hup.conf.erb diff --git a/cookbooks/aws-parallelcluster-platform/recipes/update.rb b/cookbooks/aws-parallelcluster-platform/recipes/update.rb index c8822f66cf..f568b1067c 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/update.rb @@ -15,6 +15,7 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
+fetch_dna_files "Fetch ComputeFleet's Dna files" fetch_config 'Fetch and load cluster configs' do update true end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb b/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb new file mode 100644 index 0000000000..eda5f0779c --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +resource_name :fetch_dna_files +provides :fetch_dna_files +unified_mode true + +default_action :share + +action :share do + return if on_docker? + return unless node['cluster']['node_type'] == 'HeadNode' + + Chef::Log.info("Share extra.json with ComputeFleet") + ::FileUtils.cp_r('/tmp/extra.json', "#{node['cluster']['shared_dir']}/dna/extra.json",remove_destination: true) + + execute "Share dna.json with ComputeFleet" do + command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + " --region #{node['cluster']['region']}" + timeout 30 + retries 10 + retry_delay 90 + end + +end + +action :cleanup do + return if on_docker? 
+ return unless node['cluster']['node_type'] == 'HeadNode' + + execute "Cleanup dna.json and extra.json from #{node['cluster']['shared_dir']}/dna" do + command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + " --region #{node['cluster']['region']} --cleanup" + timeout 30 + retries 10 + retry_delay 90 + end +end diff --git a/cookbooks/aws-parallelcluster-slurm/Berksfile b/cookbooks/aws-parallelcluster-slurm/Berksfile index 8c9bf7fb1e..dcbf3cf120 100644 --- a/cookbooks/aws-parallelcluster-slurm/Berksfile +++ b/cookbooks/aws-parallelcluster-slurm/Berksfile @@ -5,6 +5,7 @@ metadata cookbook "aws-parallelcluster-computefleet", path: "../aws-parallelcluster-computefleet" cookbook "aws-parallelcluster-environment", path: "../aws-parallelcluster-environment" cookbook "aws-parallelcluster-shared", path: "../aws-parallelcluster-shared" +cookbook "aws-parallelcluster-platform", path: "../aws-parallelcluster-platform" cookbook "iptables", path: "../third-party/iptables-8.0.0" cookbook "line", path: "../third-party/line-4.5.21" diff --git a/cookbooks/aws-parallelcluster-slurm/metadata.rb b/cookbooks/aws-parallelcluster-slurm/metadata.rb index 31311ddf2f..4f3f227988 100644 --- a/cookbooks/aws-parallelcluster-slurm/metadata.rb +++ b/cookbooks/aws-parallelcluster-slurm/metadata.rb @@ -18,3 +18,4 @@ depends 'aws-parallelcluster-computefleet', '~> 3.13.0' depends 'aws-parallelcluster-environment', '~> 3.13.0' depends 'aws-parallelcluster-shared', '~> 3.13.0' +depends 'aws-parallelcluster-platform', '~> 3.13.0' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb index f212c23260..b6ce642b93 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb @@ -282,10 +282,6 @@ def update_nodes_in_queue(strategy, queues) mode '0644' 
end -execute "Cleanup DNA files" do - command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ - " --region #{node['cluster']['region']} --cleanup" - timeout 30 - retries 10 - retry_delay 90 +fetch_dna_files 'Cleanup' do + action :cleanup end \ No newline at end of file From a644f17d39e457a762401f94a6f55e1de4eca2e4 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 6 Feb 2025 18:46:42 -0500 Subject: [PATCH 04/12] Remove usage of get_compute_user_data.py from fetch_config resource --- .../resources/fetch_config.rb | 19 ------------------- .../resources/fetch_dna_files.rb | 5 +++-- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb index c4b674d8c4..0f2b492ad0 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fetch_config.rb @@ -9,23 +9,6 @@ default_action :run - -action :share_dna_files do - return if on_docker? - - Chef::Log.info("Share extra.json with all nodes") - ::FileUtils.cp_r('/tmp/extra.json', "#{node['cluster']['shared_dir']}/dna/extra.json",remove_destination: true) - ::FileUtils.cp_r('/tmp/extra.json', "#{node['cluster']['shared_dir_login_nodes']}/dna/extra.json",remove_destination: true) - - execute "Share DNA files" do - command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ - " --region #{node['cluster']['region']}" - timeout 30 - retries 10 - retry_delay 90 - end -end - action :run do return if on_docker? 
Chef::Log.debug("Called fetch_config with update (#{new_resource.update})") @@ -36,8 +19,6 @@ case node['cluster']['node_type'] when 'HeadNode' if new_resource.update - action_share_dna_files - Chef::Log.info("Backing up old configuration from (#{node['cluster']['cluster_config_path']}) to (#{node['cluster']['previous_cluster_config_path']})") ::FileUtils.cp_r(node['cluster']['cluster_config_path'], node['cluster']['previous_cluster_config_path'], remove_destination: true) fetch_cluster_config(node['cluster']['cluster_config_path']) diff --git a/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb b/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb index eda5f0779c..292528c16b 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb @@ -4,6 +4,8 @@ provides :fetch_dna_files unified_mode true +property :extra_chef_attribute_location, String, default: '/tmp/extra.json' + default_action :share action :share do @@ -11,7 +13,7 @@ return unless node['cluster']['node_type'] == 'HeadNode' Chef::Log.info("Share extra.json with ComputeFleet") - ::FileUtils.cp_r('/tmp/extra.json', "#{node['cluster']['shared_dir']}/dna/extra.json",remove_destination: true) + ::FileUtils.cp_r(new_resource.extra_chef_attribute_location, "#{node['cluster']['shared_dir']}/dna/extra.json", remove_destination: true) if ::File.exist?(new_resource.extra_chef_attribute_location) execute "Share dna.json with ComputeFleet" do command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ @@ -20,7 +22,6 @@ retries 10 retry_delay 90 end - end action :cleanup do From 879de41f2b7783dba57bc782b6613eab06d1c0b7 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 7 Feb 2025 10:44:40 -0500 Subject: [PATCH 05/12] Remove usage of Login Node from get_compute_user_data.py --- .../get_compute_user_data.py | 109 ++++++++++++------ 1 
file changed, 72 insertions(+), 37 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py b/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py index 7a116d1098..985723c159 100644 --- a/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py +++ b/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py @@ -15,34 +15,37 @@ import argparse from email import message_from_string import json -import mimetypes import os import boto3 import yaml import base64 +import logging +from retrying import retry SHARED_LOCATION = "/opt/parallelcluster/" COMPUTE_FLEET_SHARED_LOCATION = SHARED_LOCATION + 'shared/' -LOGIN_POOL_SHARED_LOCATION = SHARED_LOCATION + 'shared_login_nodes/' COMPUTE_FLEET_DNA_LOC = COMPUTE_FLEET_SHARED_LOCATION + 'dna/' -LOGIN_POOL_DNA_LOC = LOGIN_POOL_SHARED_LOCATION + 'dna/' COMPUTE_FLEET_LAUNCH_TEMPLATE_ID = COMPUTE_FLEET_SHARED_LOCATION + 'launch-templates-config.json' -LOGIN_POOL_LAUNCH_TEMPLATE_ID = LOGIN_POOL_SHARED_LOCATION + 'launch-templates-config.json' +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) +def get_compute_launch_template_ids(shared_storage): + """Load launch-templates-config.json which contains ID, Version number and Logical ID of all queues in Compute Fleet's Launch Template.""" + try: + with open(shared_storage, 'r') as file: + lt_config = json.loads(file.read()) + return lt_config + except Exception as err: + logger.warn("Unable to read %s due to %s", shared_storage, err) -def get_launch_template_details(shared_storage): - with open(shared_storage, 'r') as file: - lt_config = json.loads(file.read()) - return lt_config - - -def get_compute_launch_template_ids(args): - lt_config = get_launch_template_details(COMPUTE_FLEET_LAUNCH_TEMPLATE_ID) +def create_dna_files(args): + """Creates all dna.json for each queue in 
cluster.""" + lt_config = get_compute_launch_template_ids(COMPUTE_FLEET_LAUNCH_TEMPLATE_ID) if lt_config: all_queues = lt_config.get('Queues') for _, queues in all_queues.items(): @@ -51,15 +54,15 @@ def get_compute_launch_template_ids(args): get_latest_dns_data(compute_res, COMPUTE_FLEET_DNA_LOC, args) -def get_login_pool_launch_template_ids(args): - lt_config = get_launch_template_details(LOGIN_POOL_LAUNCH_TEMPLATE_ID) - if lt_config: - login_pools = lt_config.get('LoginPools') - for _, pool in login_pools.items(): - get_latest_dns_data(pool, LOGIN_POOL_DNA_LOC, args) - - +@retry(stop_max_attempt_number=5, wait_fixed=3000) def get_user_data(lt_id, lt_version, region_name): + """ + Calls EC2 DescribeLaunchTemplateVersions API to get UserData from Launch Template specified. + :param lt_id: Launch Template ID (eg: lt-12345678901234567) + :param lt_version: Launch Template latest Version Number (eg: 2) + :param region_name: AWS region name (eg: us-east-1) + :return: User_data in MIME format + """ try: ec2_client = boto3.client("ec2", region_name=region_name) response = ec2_client.describe_launch_template_versions( @@ -70,11 +73,19 @@ def get_user_data(lt_id, lt_version, region_name): ).get('LaunchTemplateVersions') decoded_data = base64.b64decode(response[0]['LaunchTemplateData']['UserData'], validate=True).decode('utf-8') return decoded_data - except Exception as e: # binascii.Error: - print("Exception raised", e) + except Exception as err: + if hasattr(err, "message"): + err = err.message + logger.error( + "Unable to get UserData for launch template%s with version %s.\nException: %s", + lt_id, lt_version, err + ) def parse_mime_user_data(user_data): + """ + Parses MIME formatted UserData that we get from EC2 to extract write_files section from cloud-config section. 
+ """ data = message_from_string(user_data) for cloud_config_section in data.walk(): if cloud_config_section.get_content_type() == 'text/cloud-config': @@ -84,29 +95,49 @@ def parse_mime_user_data(user_data): def write_dna_files(write_files_section, shared_storage_loc): - for data in write_files_section: - if data['path'] in ['/tmp/dna.json']: - with open(shared_storage_loc+"-dna.json" ,"w") as file: - file.write(json.dumps(json.loads(data['content']),indent=4)) - + """ + Writes the dna.json in shared location after extracting it from write_files section of UserData. + :param write_files_section: Entire write_files section from UserData + :param shared_storage_loc: Shared Storage Location of where to write dna.json + :return: None + """ + try: + file_path = shared_storage_loc+"-dna.json" + for data in write_files_section: + if data['path'] in ['/tmp/dna.json']: + with open(file_path,"w") as file: + file.write(json.dumps(json.loads(data['content']),indent=4)) + except Exception as err: + if hasattr(err, "message"): + err = err.message + logger.error("Unable to write %s due to %s", file_path, err) def get_latest_dns_data(resource, output_location, args): + """ + Function to get latest User Data, extract relevant details and write dna.json. 
+ :param resource: Resource containing LT ID, Version and Logical id + :param output_location: Shared Storage Location were we want to write dna.json + :param args: Command Line arguments + :rtype: None + """ user_data = get_user_data(resource.get('LaunchTemplate').get('Id'), resource.get('LaunchTemplate').get('Version'), args.region) write_directives = parse_mime_user_data(user_data) write_dna_files(write_directives, output_location+resource.get('LaunchTemplate').get("LogicalId")) def cleanup(directory_loc): + """Cleanup dna.json and extra.json files.""" for f in os.listdir(directory_loc): f_path = os.path.join(directory_loc, f) try: if os.path.isfile(f_path): os.remove(f_path) - except Exception as e: - print(f"Error deleting {f_path}: {e}") + except Exception as err: + logger.warn(f"Unable to delete %s due to %s", f_path, err) def _parse_cli_args(): + """Parse command line args.""" parser = argparse.ArgumentParser( - description="Get latest User Data from Compute and Login Node Launch Templates.", exit_on_error=False + description="Get latest User Data from ComputeFleet Launch Templates.", exit_on_error=False ) parser.add_argument( @@ -133,13 +164,17 @@ def _parse_cli_args(): def main(): - args = _parse_cli_args() - if args.cleanup: - cleanup(COMPUTE_FLEET_DNA_LOC) - cleanup(LOGIN_POOL_DNA_LOC) - else: - get_compute_launch_template_ids(args) - #get_login_pool_launch_template_ids(args) + try: + args = _parse_cli_args() + if args.cleanup: + cleanup(COMPUTE_FLEET_DNA_LOC) + else: + create_dna_files(args) + except Exception as err: + if hasattr(err, "message"): + err = err.message + logger.exception("Encountered exception when fetching latest dna.json for ComputeFleet, exiting gracefully: %s", err) + raise SystemExit(0) if __name__ == "__main__": From 5498c500b22d2d18d7d5eef65b104acd640bcf3e Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 10 Feb 2025 10:46:51 -0500 Subject: [PATCH 06/12] [Unit Test] Unit test for fetch_dna_files resource --- 
.../unit/resources/fetch_dna_files_spec.rb | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb new file mode 100644 index 0000000000..7a956fa166 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb @@ -0,0 +1,78 @@ +require 'spec_helper' + +class ConvergeFetchDnaFiles + def self.share(chef_run, extra_chef_attribute_location: nil) + chef_run.converge_dsl('aws-parallelcluster-platform') do + fetch_dna_files 'share' do + extra_chef_attribute_location extra_chef_attribute_location + action :share + end + end + end + + def self.cleanup(chef_run) + chef_run.converge_dsl('aws-parallelcluster-platform') do + fetch_dna_files 'cleanup' do + action :cleanup + end + end + end +end + +describe 'fetch_dna_files resource' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:script_dir) { 'SCRIPT_DIR' } + cached(:shared_dir) { 'SHARED_DIR' } + cached(:region) { 'REGION' } + + context "when we share dna files" do + cached(:chef_run) do + runner = runner(platform: platform, version: version, step_into: ['fetch_dna_files']) do |node| + node.override['cluster']['scripts_dir'] = script_dir + node.override['cluster']['shared_dir'] = shared_dir + node.override['cluster']['node_type'] = 'HeadNode' + node.override['cluster']['region'] = region + node.override['kitchen'] = true + end + ConvergeFetchDnaFiles.share(runner, extra_chef_attribute_location: "#{kitchen_instance_types_data_path}") + end + cached(:node) { chef_run.node } + + # it "it copies data from /tmp/extra.json" do + # is_expected.to create_remote_file("copy extra.json") + # .with(path: "#{shared_dir}/dna/extra.json") + # .with(source: 
"file://#{kitchen_instance_types_data_path}") + # end + + it 'runs get_compute_user_data.py to get dna files' do + is_expected.to run_execute('Share dna.json with ComputeFleet').with( + command: "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + " --region #{node['cluster']['region']}" + ) + end + end + + context "when we cleanup dna files" do + cached(:chef_run) do + runner = runner(platform: platform, version: version, step_into: ['fetch_dna_files']) do |node| + node.override['cluster']['scripts_dir'] = script_dir + node.override['cluster']['shared_dir'] = shared_dir + node.override['cluster']['node_type'] = 'HeadNode' + node.override['cluster']['region'] = region + end + allow_any_instance_of(Object).to receive(:aws_domain).and_return(aws_domain) + ConvergeFetchDnaFiles.cleanup(runner) + end + cached(:node) { chef_run.node } + + it 'cleanups dna files' do + is_expected.to run_execute("Cleanup dna.json and extra.json from #{node['cluster']['shared_dir']}/dna").with( + command: "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + " --region #{node['cluster']['region']} --cleanup" + ) + end + end + end + end +end From 2eff92541f6f004aad014f34067025b1c7feacc1 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 10 Feb 2025 10:57:41 -0500 Subject: [PATCH 07/12] [Kitchen Test] Test for cfn_hup_configuration resource --- .../kitchen.environment-config.yml | 14 +++++ .../controls/cfn_hup_configuration_spec.rb | 60 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb diff --git a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml index 9e6f5eb351..ac4053c36b 100644 --- a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml +++ 
b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml @@ -314,6 +314,20 @@ suites: directory_service: enabled: "true" node_type: HeadNode + - name: cfn_hup_configuration + run_list: + - recipe[aws-parallelcluster-tests::setup] + - recipe[aws-parallelcluster-tests::test_resource] + verifier: + controls: + - /tag:config_cfn_hup/ + attributes: + resource: cfn_hup_configuration:configure + dependencies: + - recipe:aws-parallelcluster-platform::directories + cluster: + node_type: HeadNode + stack_arn: 'test' # Recipes - name: cfnconfig_mixed diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb new file mode 100644 index 0000000000..b9bfc9e78a --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb @@ -0,0 +1,60 @@ +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +control 'tag:config_cfn_hup_conf_files_created' do + title "cfn_hup configuration files and directories should be created" + + %w(/etc/cfn /etc/cfn/hooks.d).each do |dir| + describe directory(dir) do + it { should exist } + its('mode') { should cmp '0770' } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + end + end + + %w(/etc/cfn/cfn-hup.conf /etc/cfn/hooks.d/pcluster-update.conf).each do |conf_file| + describe file(conf_file) do + it { should exist } + its('mode') { should cmp '0400' } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + end + end +end + +control 'tag:config_cfn_hup_head_node_configuration' do + title "cfn_hup configuration files and directories for HeadNode should be created" + only_if { instance.head_node? } + + describe file("#{node['cluster']['scripts_dir']}/get_compute_user_data.py") do + it { should exist } + its('mode') { should cmp '0400' } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + end + + describe directory("#{node['cluster']['base_dir']}/dna") do + it { should exist } + end +end + +control 'tag:config_cfn_hup_compute_configuration' do + title "cfn_hup configuration files and directories for ComputeFleet should be created" + only_if { instance.compute_node? 
} + + describe file("#{node['cluster']['scripts_dir']}/cfn-hup-update-action.sh") do + it { should exist } + its('mode') { should cmp '0744' } + its('owner') { should eq 'root' } + its('group') { should eq 'root' } + end +end From ba1e0741b9da6cb34bc3489425167bc4901a0880 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 10 Feb 2025 11:12:45 -0500 Subject: [PATCH 08/12] Make cfn-hup-update-action.sh executable Only by root --- .../resources/cfn_hup_configuration.rb | 2 +- .../spec/unit/resources/cfn_hup_configuration_spec.rb | 2 +- .../test/controls/cfn_hup_configuration_spec.rb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb b/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb index e09d6e87fe..be77113e22 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb @@ -87,7 +87,7 @@ source "cfn_hup_configuration/#{node['cluster']['node_type']}/cfn-hup-update-action.sh.erb" owner 'root' group 'root' - mode '0744' # TODO: Change permission + mode '0700' variables( monitor_shared_dir: monitor_shared_dir, launch_template_resource_id: node['cluster']['launch_template_id'] diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb index 86477ca056..e1e28595aa 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb @@ -90,7 +90,7 @@ def self.configure(chef_run) .with(source: "cfn_hup_configuration/#{node_type}/cfn-hup-update-action.sh.erb") .with(user: "root") .with(group: "root") - .with(mode: "0744") + .with(mode: "0700") .with(variables: { 
monitor_shared_dir: "#{MONITOR_SHARED_DIR}/dna", launch_template_resource_id: LAUNCH_TEMPLATE_ID, diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb index b9bfc9e78a..3b0bba47f5 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb @@ -53,7 +53,7 @@ describe file("#{node['cluster']['scripts_dir']}/cfn-hup-update-action.sh") do it { should exist } - its('mode') { should cmp '0744' } + its('mode') { should cmp '0700' } its('owner') { should eq 'root' } its('group') { should eq 'root' } end From c8d8cb6cc80a91c6827365958f1a02287b1bdf21 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 10 Feb 2025 11:25:25 -0500 Subject: [PATCH 09/12] Explicit return statement at end of the function --- .../files/cfn_hup_configuration/get_compute_user_data.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py b/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py index 985723c159..50fd77916b 100644 --- a/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py +++ b/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py @@ -38,10 +38,12 @@ def get_compute_launch_template_ids(shared_storage): try: with open(shared_storage, 'r') as file: lt_config = json.loads(file.read()) - return lt_config except Exception as err: logger.warn("Unable to read %s due to %s", shared_storage, err) + return lt_config + + def create_dna_files(args): """Creates all dna.json for each queue in cluster.""" @@ -72,7 +74,6 @@ def get_user_data(lt_id, lt_version, region_name): ], 
).get('LaunchTemplateVersions') decoded_data = base64.b64decode(response[0]['LaunchTemplateData']['UserData'], validate=True).decode('utf-8') - return decoded_data except Exception as err: if hasattr(err, "message"): err = err.message @@ -81,6 +82,9 @@ def get_user_data(lt_id, lt_version, region_name): lt_id, lt_version, err ) + return decoded_data + + def parse_mime_user_data(user_data): """ From ea3c56674b30024acd00da925d24e0757d619f07 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 17 Feb 2025 14:48:44 -0500 Subject: [PATCH 10/12] Change permission for cfn-hup files and directories for only root user access * Add Proxy if being used. --- ...ser_data.py => share_compute_fleet_dna.py} | 107 +++++++++++++----- .../recipes/config.rb | 2 +- .../resources/cfn_hup_configuration.rb | 15 +-- .../resources/cfn_hup_configuration_spec.rb | 13 ++- .../ComputeFleet/cfn-hup-update-action.sh.erb | 8 +- .../cfn-hook-update.conf.erb | 2 +- .../controls/cfn_hup_configuration_spec.rb | 8 +- .../resources/fetch_dna_files.rb | 18 ++- .../unit/resources/fetch_dna_files_spec.rb | 6 +- 9 files changed, 122 insertions(+), 57 deletions(-) rename cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/{get_compute_user_data.py => share_compute_fleet_dna.py} (62%) diff --git a/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py b/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/share_compute_fleet_dna.py similarity index 62% rename from cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py rename to cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/share_compute_fleet_dna.py index 50fd77916b..ac67af520e 100644 --- a/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/get_compute_user_data.py +++ b/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/share_compute_fleet_dna.py @@ -1,4 +1,4 @@ -# Copyright 
2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the @@ -11,49 +11,88 @@ # limitations under the License. - +import configparser import argparse from email import message_from_string import json import os import boto3 +from botocore.config import Config import yaml import base64 import logging from retrying import retry -SHARED_LOCATION = "/opt/parallelcluster/" - -COMPUTE_FLEET_SHARED_LOCATION = SHARED_LOCATION + 'shared/' +COMPUTE_FLEET_SHARED_LOCATION = "/opt/parallelcluster/shared/" -COMPUTE_FLEET_DNA_LOC = COMPUTE_FLEET_SHARED_LOCATION + 'dna/' +COMPUTE_FLEET_SHARED_DNA_LOCATION = COMPUTE_FLEET_SHARED_LOCATION + 'dna/' -COMPUTE_FLEET_LAUNCH_TEMPLATE_ID = COMPUTE_FLEET_SHARED_LOCATION + 'launch-templates-config.json' +COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG = COMPUTE_FLEET_SHARED_LOCATION + 'launch-templates-config.json' logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) -def get_compute_launch_template_ids(shared_storage): - """Load launch-templates-config.json which contains ID, Version number and Logical ID of all queues in Compute Fleet's Launch Template.""" +def get_compute_launch_template_ids(lt_config_file_name): + """Load launch-templates-config.json which contains ID, Version number and Logical ID of all queues in Compute Fleet's Launch Template. 
+ The format of launch-templates-config.json is + { + "Queues": { + "queue1": { + "ComputeResources": { + "queue1-i1": { + "LaunchTemplate": { + "Version": "1", + "LogicalId": "LaunchTemplate123456789012345", + "Id": "lt-12345678901234567" + } + } + } + }, + "queue2": { + "ComputeResources": { + "queue2-i1": { + "LaunchTemplate": { + "Version": "1", + "LogicalId": "LaunchTemplate012345678901234", + "Id": "lt-01234567890123456" + } + } + } + } + } + } + """ + lt_config = None try: - with open(shared_storage, 'r') as file: + with open(lt_config_file_name, 'r') as file: lt_config = json.loads(file.read()) except Exception as err: - logger.warn("Unable to read %s due to %s", shared_storage, err) + logger.warn("Unable to read %s due to %s", lt_config_file_name, err) return lt_config - -def create_dna_files(args): +def share_compute_fleet_dna(args): """Creates all dna.json for each queue in cluster.""" - lt_config = get_compute_launch_template_ids(COMPUTE_FLEET_LAUNCH_TEMPLATE_ID) + lt_config = get_compute_launch_template_ids(COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG) if lt_config: all_queues = lt_config.get('Queues') for _, queues in all_queues.items(): compute_resources = queues.get('ComputeResources') for _, compute_res in compute_resources.items(): - get_latest_dns_data(compute_res, COMPUTE_FLEET_DNA_LOC, args) + get_latest_dna_data(compute_res, COMPUTE_FLEET_SHARED_DNA_LOCATION, args) + + +# FIXME: Fix Code Duplication +def parse_proxy_config(): + config = configparser.RawConfigParser() + config.read("/etc/boto.cfg") + proxy_config = Config() + if config.has_option("Boto", "proxy") and config.has_option("Boto", "proxy_port"): + proxy = config.get("Boto", "proxy") + proxy_port = config.get("Boto", "proxy_port") + proxy_config = Config(proxies={"https": f"{proxy}:{proxy_port}"}) + return proxy_config @retry(stop_max_attempt_number=5, wait_fixed=3000) @@ -65,8 +104,11 @@ def get_user_data(lt_id, lt_version, region_name): :param region_name: AWS region name (eg: us-east-1) 
:return: User_data in MIME format """ + decoded_data = None try: - ec2_client = boto3.client("ec2", region_name=region_name) + proxy_config = parse_proxy_config() + + ec2_client = boto3.client("ec2", region_name=region_name, config=proxy_config) response = ec2_client.describe_launch_template_versions( LaunchTemplateId= lt_id, Versions=[ @@ -78,23 +120,25 @@ def get_user_data(lt_id, lt_version, region_name): if hasattr(err, "message"): err = err.message logger.error( - "Unable to get UserData for launch template%s with version %s.\nException: %s", + "Unable to get UserData for launch template %s with version %s.\nException: %s", lt_id, lt_version, err ) return decoded_data - -def parse_mime_user_data(user_data): +def get_write_directives_section(user_data): """ Parses MIME formatted UserData that we get from EC2 to extract write_files section from cloud-config section. """ - data = message_from_string(user_data) - for cloud_config_section in data.walk(): - if cloud_config_section.get_content_type() == 'text/cloud-config': - write_directives_section = yaml.safe_load(cloud_config_section._payload).get('write_files') - + write_directives_section = None + try: + data = message_from_string(user_data) + for cloud_config_section in data.walk(): + if cloud_config_section.get_content_type() == 'text/cloud-config': + write_directives_section = yaml.safe_load(cloud_config_section._payload).get('write_files') + except Exception as err: + logger.error("Error occurred while parsing write_files section.\nException: %s", err) return write_directives_section @@ -116,7 +160,8 @@ def write_dna_files(write_files_section, shared_storage_loc): err = err.message logger.error("Unable to write %s due to %s", file_path, err) -def get_latest_dns_data(resource, output_location, args): + +def get_latest_dna_data(resource, output_location, args): """ Function to get latest User Data, extract relevant details and write dna.json. 
:param resource: Resource containing LT ID, Version and Logical id @@ -125,8 +170,10 @@ def get_latest_dns_data(resource, output_location, args): :rtype: None """ user_data = get_user_data(resource.get('LaunchTemplate').get('Id'), resource.get('LaunchTemplate').get('Version'), args.region) - write_directives = parse_mime_user_data(user_data) - write_dna_files(write_directives, output_location+resource.get('LaunchTemplate').get("LogicalId")) + if user_data: + write_directives = get_write_directives_section(user_data) + write_dna_files(write_directives, output_location+resource.get('LaunchTemplate').get("LogicalId")) + def cleanup(directory_loc): """Cleanup dna.json and extra.json files.""" @@ -138,6 +185,7 @@ def cleanup(directory_loc): except Exception as err: logger.warn(f"Unable to delete %s due to %s", f_path, err) + def _parse_cli_args(): """Parse command line args.""" parser = argparse.ArgumentParser( @@ -157,7 +205,6 @@ def _parse_cli_args(): "-c", "--cleanup", action="store_true", - default=False, required=False, help="Cleanup DNA files created", ) @@ -171,9 +218,9 @@ def main(): try: args = _parse_cli_args() if args.cleanup: - cleanup(COMPUTE_FLEET_DNA_LOC) + cleanup(COMPUTE_FLEET_SHARED_DNA_LOCATION) else: - create_dna_files(args) + share_compute_fleet_dna(args) except Exception as err: if hasattr(err, "message"): err = err.message diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config.rb b/cookbooks/aws-parallelcluster-environment/recipes/config.rb index 1326bca930..6d58292e20 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config.rb @@ -38,4 +38,4 @@ # spack 'Configure Spack Packages' do # action :configure # end -cfn_hup_configuration "Configure Cfn-hup" +cfn_hup_configuration "Configure cfn-hup" diff --git a/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb 
b/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb index be77113e22..60ecc760af 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true # -# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the # License. A copy of the License is located at @@ -27,14 +27,14 @@ directory '/etc/cfn' do owner 'root' group 'root' - mode '0770' + mode '0700' recursive true end directory '/etc/cfn/hooks.d' do owner 'root' group 'root' - mode '0770' + mode '0700' recursive true end @@ -64,7 +64,8 @@ cloudformation_url: cloudformation_url, cfn_init_role: instance_role_name, launch_template_resource_id: node['cluster']['launch_template_id'], - update_hook_script_dir: node['cluster']['scripts_dir'] + update_hook_script_dir: node['cluster']['scripts_dir'], + node_bootstrap_timeout: node['cluster']['compute_node_bootstrap_timeout'] || node['cluster']['Timeout'] ) end end @@ -72,11 +73,11 @@ action :extra_configuration do case node['cluster']['node_type'] when 'HeadNode' - cookbook_file "#{node['cluster']['scripts_dir']}/get_compute_user_data.py" do - source 'cfn_hup_configuration/get_compute_user_data.py' + cookbook_file "#{node['cluster']['scripts_dir']}/share_compute_fleet_dna.py" do + source 'cfn_hup_configuration/share_compute_fleet_dna.py' owner 'root' group 'root' - mode '0755' + mode '0700' action :create_if_missing end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb index e1e28595aa..071aed0558 100644 --- 
a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb @@ -18,6 +18,7 @@ def self.configure(chef_run) LAUNCH_TEMPLATE_ID = "LAUNCH_TEMPLATE_ID".freeze SCRIPT_DIR = "SCRIPT_DIR".freeze MONITOR_SHARED_DIR = "MONITOR_SHARED_DIR".freeze +NODE_BOOTSTRAP_TIMEOUT = "1800" describe 'cfn_hup_configuration:configure' do for_all_oses do |platform, version| @@ -39,6 +40,7 @@ def self.configure(chef_run) node.override["cluster"]["launch_template_id"] = LAUNCH_TEMPLATE_ID node.override['cluster']['scripts_dir'] = SCRIPT_DIR node.override['cluster']['shared_dir'] = MONITOR_SHARED_DIR + node.override['cluster']['compute_node_bootstrap_timeout'] = NODE_BOOTSTRAP_TIMEOUT end ConvergeCfnHupConfiguration.configure(runner) end @@ -49,7 +51,7 @@ def self.configure(chef_run) is_expected.to create_directory(dir) .with(owner: 'root') .with(group: 'root') - .with(mode: "0770") + .with(mode: "0700") .with(recursive: true) end end @@ -81,6 +83,7 @@ def self.configure(chef_run) cfn_init_role: INSTANCE_ROLE_NAME, launch_template_resource_id: LAUNCH_TEMPLATE_ID, update_hook_script_dir: SCRIPT_DIR, + node_bootstrap_timeout: NODE_BOOTSTRAP_TIMEOUT, }) end @@ -97,12 +100,12 @@ def self.configure(chef_run) }) end elsif node_type == 'HeadNode' - it "creates #{SCRIPT_DIR}/get_compute_user_data.py" do - is_expected.to create_if_missing_cookbook_file("#{SCRIPT_DIR}/get_compute_user_data.py") - .with(source: 'cfn_hup_configuration/get_compute_user_data.py') + it "creates #{SCRIPT_DIR}/share_compute_fleet_dna.py" do + is_expected.to create_if_missing_cookbook_file("#{SCRIPT_DIR}/share_compute_fleet_dna.py") + .with(source: 'cfn_hup_configuration/share_compute_fleet_dna.py') .with(user: 'root') .with(group: 'root') - .with(mode: '0755') + .with(mode: '0700') end it "creates the directory #{MONITOR_SHARED_DIR}/dna" do diff --git 
a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/ComputeFleet/cfn-hup-update-action.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/ComputeFleet/cfn-hup-update-action.sh.erb index b16424dbca..648603d6aa 100644 --- a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/ComputeFleet/cfn-hup-update-action.sh.erb +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/ComputeFleet/cfn-hup-update-action.sh.erb @@ -1,8 +1,10 @@ #!/bin/bash set -ex - - +# This script is invoked by cfn-hup as part of its update hook action. +# This script runs on each node of ComputeFleet to monitor the shared location for latest dna.json and extra.json files and run ParallelCluster Cookbook recipes. +# +# Usage: ./cfn-hup-update-action.sh function run_cookbook_recipes() { LATEST_DNA_LOC=<%= @monitor_shared_dir %> @@ -35,7 +37,7 @@ function run_cookbook_recipes() { main() { PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; . /etc/parallelcluster/pcluster_cookbook_environment.sh; - echo "We monitor <%= @monitor_shared_dir %> to check for <%= @launch_template_resource_id %>-dna.json is being added" + echo "We monitor <%= @monitor_shared_dir %> to check for <%= @launch_template_resource_id %>-dna.json has been added and run ParallelCluster cookbook recipes." run_cookbook_recipes } diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb index ea18afa144..d918ab4334 100644 --- a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb @@ -6,6 +6,6 @@ path=Resources.<%= @launch_template_resource_id %>.Metadata.AWS::CloudFormation: action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; . 
/etc/parallelcluster/pcluster_cookbook_environment.sh; $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -v --stack <%= @stack_id %> --resource <%= @launch_template_resource_id %> --configsets update --region <%= @region %> --url <%= @cloudformation_url %> --role <%= @cfn_init_role %> <% when 'ComputeFleet' -%> path=Resources.<%= @launch_template_resource_id %> -action=timeout 900 <%= @update_hook_script_dir %>/cfn-hup-update-action.sh +action=timeout <%= @node_bootstrap_timeout %> <%= @update_hook_script_dir %>/cfn-hup-update-action.sh <% end %> runas=root diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb index 3b0bba47f5..54170f695c 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb @@ -1,4 +1,4 @@ -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. A copy of the License is located at @@ -15,7 +15,7 @@ %w(/etc/cfn /etc/cfn/hooks.d).each do |dir| describe directory(dir) do it { should exist } - its('mode') { should cmp '0770' } + its('mode') { should cmp '0700' } its('owner') { should eq 'root' } its('group') { should eq 'root' } end @@ -35,9 +35,9 @@ title "cfn_hup configuration files and directories for HeadNode should be created" only_if { instance.head_node? 
} - describe file("#{node['cluster']['scripts_dir']}/get_compute_user_data.py") do + describe file("#{node['cluster']['scripts_dir']}/share_compute_fleet_dna.py") do it { should exist } - its('mode') { should cmp '0400' } + its('mode') { should cmp '0700' } its('owner') { should eq 'root' } its('group') { should eq 'root' } end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb b/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb index 292528c16b..c3c4c5e15b 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fetch_dna_files.rb @@ -1,5 +1,17 @@ # frozen_string_literal: true +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ resource_name :fetch_dna_files provides :fetch_dna_files unified_mode true @@ -15,8 +27,8 @@ Chef::Log.info("Share extra.json with ComputeFleet") ::FileUtils.cp_r(new_resource.extra_chef_attribute_location, "#{node['cluster']['shared_dir']}/dna/extra.json", remove_destination: true) if ::File.exist?(new_resource.extra_chef_attribute_location) - execute "Share dna.json with ComputeFleet" do - command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + execute "Run share_compute_fleet_dna.py to get user_data.sh and share dna.json with ComputeFleet" do + command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/share_compute_fleet_dna.py" \ " --region #{node['cluster']['region']}" timeout 30 retries 10 @@ -29,7 +41,7 @@ return unless node['cluster']['node_type'] == 'HeadNode' execute "Cleanup dna.json and extra.json from #{node['cluster']['shared_dir']}/dna" do - command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/share_compute_fleet_dna.py" \ " --region #{node['cluster']['region']} --cleanup" timeout 30 retries 10 diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb index 7a956fa166..7216bb60ea 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb @@ -45,9 +45,9 @@ def self.cleanup(chef_run) # .with(source: "file://#{kitchen_instance_types_data_path}") # end - it 'runs get_compute_user_data.py to get dna files' do + it 'runs share_compute_fleet_dna.py to get dna files' do is_expected.to run_execute('Share dna.json with ComputeFleet').with( - command: "#{cookbook_virtualenv_path}/bin/python 
#{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + command: "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/share_compute_fleet_dna.py" \ " --region #{node['cluster']['region']}" ) end @@ -68,7 +68,7 @@ def self.cleanup(chef_run) it 'cleanups dna files' do is_expected.to run_execute("Cleanup dna.json and extra.json from #{node['cluster']['shared_dir']}/dna").with( - command: "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/get_compute_user_data.py" \ + command: "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/share_compute_fleet_dna.py" \ " --region #{node['cluster']['region']} --cleanup" ) end From adc3affed54a7d26382133b54a617f12f391bb99 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 18 Feb 2025 12:27:54 -0500 Subject: [PATCH 11/12] Add unit test for share_compute_fleet_dna.py --- pytest.ini | 1 + .../test_share_compute_fleet_dna.py | 110 ++++++++++++++++++ .../user_data_1.txt | 53 +++++++++ .../user_data_2.txt | 46 ++++++++ 4 files changed, 210 insertions(+) create mode 100644 test/unit/cfn_hup_configuration/test_share_compute_fleet_dna.py create mode 100644 test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_1.txt create mode 100644 test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_2.txt diff --git a/pytest.ini b/pytest.ini index 6629cd67a8..e60f70bc63 100644 --- a/pytest.ini +++ b/pytest.ini @@ -13,6 +13,7 @@ pythonpath = cookbooks/aws-parallelcluster-platform/files/dcv cookbooks/aws-parallelcluster-environment/files/cloudwatch + cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration cookbooks/aws-parallelcluster-computefleet/files/compute_fleet_status cookbooks/aws-parallelcluster-computefleet/files/clusterstatusmgtd cookbooks/aws-parallelcluster-environment/files/custom_action_executor diff --git 
a/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna.py b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna.py new file mode 100644 index 0000000000..d46b200fbf --- /dev/null +++ b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna.py @@ -0,0 +1,110 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +import os +from unittest.mock import MagicMock, patch +import boto3 +import pytest +import json +from assertpy import assert_that +from botocore.stub import Stubber +from share_compute_fleet_dna import get_compute_launch_template_ids, get_user_data, get_write_directives_section, parse_proxy_config +from base64 import b64encode + + +@pytest.mark.parametrize( + ("launch_template_config_content", "errors"), + [ + ("{\"Queues\":{\"queue-0\":{\"ComputeResources\":{\"compute-resource-1\":{\"LaunchTemplate\":{\"Version\":\"1\",\"Id\":\"lt-037123456747c3bc5\"}},\"compute-resource-2\":{\"LaunchTemplate\":{\"Version\":\"1\",\"Id\":\"lt-0fcecb59a3721c0b3\"}},\"compute-resource-0\":{\"LaunchTemplate\":{\"Version\":\"1\",\"Id\":\"lt-12345678901234567\"}}}}}}", False), + ("{\"Queues\":{\"queue-0\":}}}", True), + + ], +) +def test_get_compute_launch_template_ids(mocker, launch_template_config_content, errors): + mocker.patch("builtins.open", mocker.mock_open(read_data=launch_template_config_content)) + actual_op = get_compute_launch_template_ids(launch_template_config_content) + if errors: + assert_that(actual_op).is_none() + else: + 
assert_that(actual_op).is_equal_to(json.loads(launch_template_config_content)) + + +@pytest.mark.parametrize( + ("mime_user_data_file", "write_section"), + [ + ("user_data_1.txt", + [{'path': '/tmp/dna.json', 'permissions': '0644', 'owner': 'root:root', 'content': '{"cluster":{"base_os":"alinux2","cluster_config_s3_key":"parallelcluster/clusters/dummy-cluster-randomstring123/configs/cluster-config-with-implied-values.yaml","cluster_config_version":"","cluster_name":"clustername","cluster_s3_bucket":"parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete","cluster_user":"ec2-user","custom_awsbatchcli_package":"","custom_node_package":"","cw_logging_enabled":"true","default_user_home":"local","directory_service":{"domain_read_only_user":"","enabled":"false","generate_ssh_keys_for_users":"false"},"disable_sudo_access_for_default_user":"true","dns_domain":"{\\"Ref\\": \\"referencetoclusternameClusterDNSDomain8D0872E1Ref\\"}","ebs_shared_dirs":"","efs_encryption_in_transits":"","efs_fs_ids":"","efs_iam_authorizations":"","efs_shared_dirs":"","efs_access_point_ids":"","enable_intel_hpc_platform":"false","ephemeral_dir":"/scratch","fsx_dns_names":"","fsx_fs_ids":"","fsx_fs_types":"","fsx_mount_names":"","fsx_shared_dirs":"","fsx_volume_junction_paths":"","head_node_private_ip":"{\\"Ref\\": \\"referencetoclusternameHeadNodeENI6497A502PrimaryPrivateIpAddress\\"}","hosted_zone":"{\\"Ref\\": \\"referencetoclusternameRoute53HostedZone2388733DRef\\"}","dcv_enabled":"login_node","dcv_port":8444,"log_group_name":"/aws/parallelcluster/clustername-202401151530","log_rotation_enabled":"true","pool_name":"login","node_type":"LoginNode","proxy":"NONE","raid_shared_dir":"","raid_type":"","region":"us-east-1","scheduler":"slurm","shared_storage_type":"ebs","stack_arn":"{\\"Ref\\": \\"AWS::StackId\\"}","stack_name":"clustername","use_private_hostname":"false","launch_template_id":"LoginNodeLaunchTemplate2736fab291f04e69"}}\n'}, {'path': '/tmp/extra.json', 'permissions': '0644', 'owner': 
'root:root', 'content': '{}\n'}, {'path': '/tmp/bootstrap.sh', 'permissions': '0744', 'owner': 'root:root', 'content': '#!/bin/bash -x\n\nfunction error_exit\n{\n echo "Bootstrap failed with error: $1"\n}\n'}] + ), + ("user_data_2.txt", + [{'content': '{"cluster":{"base_os":"alinux2"}}\n', 'owner': 'root:root', 'path': '/tmp/dna.json', 'permissions': '0644'}, {'content': '{"cluster": {"nvidia": {"enabled": "yes" }, "is_official_ami_build": "true"}}\n', 'owner': 'root:root', 'path': '/tmp/extra.json', 'permissions': '0644'}, {'content': '#!/bin/bash -x\n\necho "Bootstrap failed with error: $1"\n', 'owner': 'root:root', 'path': '/tmp/bootstrap.sh', 'permissions': '0744'}] + ), + ("",None), + ], +) +def test_get_write_directives_section(mime_user_data_file, write_section, test_datadir): + input_mime_user_data = None + if mime_user_data_file: + with open(os.path.join(test_datadir, mime_user_data_file), 'r') as file: + input_mime_user_data = file.read().strip() + + assert_that(get_write_directives_section(input_mime_user_data)).is_equal_to(write_section) + + +@pytest.mark.parametrize( + ("error", "proxy", "port"), + [ + (True, "myproxy.com", "8080"), + (False, "", "") + ] +) +def test_parse_proxy_config(error, proxy, port): + mock_config = MagicMock(return_value = error) + mock_config.get.side_effect = [ proxy, port ] + expected_op = {'https': proxy+':'+ port } + with patch('configparser.RawConfigParser', return_value=mock_config): + assert_that(parse_proxy_config().proxies).is_equal_to(expected_op) + + +def ec2_describe_launch_template_versions_mock(response, lt_id, lt_version): + e2_client = boto3.client('ec2', region_name='us-east-1') + stubber = Stubber(e2_client) + stubber.add_response('describe_launch_template_versions', response, { + 'LaunchTemplateId': lt_id, + 'Versions': [lt_version] + }) + stubber.activate() + return e2_client, stubber + + +@pytest.mark.parametrize( + ('expected_user_data' ), + [ + ("#!/bin/bash\necho 'Test'"), + ("") + + ], +) +def 
test_get_user_data(expected_user_data): + lt_id, lt_version = "lt-12345678901234567", "1" + ec2_response = { + "LaunchTemplateVersions": [{ + "LaunchTemplateData": { + "UserData": b64encode(expected_user_data.encode()).decode('utf-8') + } + }] + } + + ec2_client, stubber = ec2_describe_launch_template_versions_mock(ec2_response, lt_id, lt_version) + + with patch('boto3.client') as mock_client: + mock_client.return_value = ec2_client + with stubber: + assert_that(get_user_data(lt_id, lt_version, 'us-east-1')).is_equal_to(expected_user_data) + stubber.deactivate() \ No newline at end of file diff --git a/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_1.txt b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_1.txt new file mode 100644 index 0000000000..3d4f182950 --- /dev/null +++ b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_1.txt @@ -0,0 +1,53 @@ +Content-Type: multipart/mixed; boundary="==BOUNDARY==" +MIME-Version: 1.0 + +--==BOUNDARY== +Content-Type: text/cloud-boothook; charset="us-ascii" +MIME-Version: 1.0 + +#!/bin/bash -x + + which dnf 2>/dev/null; dnf=$? + which yum 2>/dev/null; yum=$? 
+ +--==BOUNDARY== +Content-Type: text/cloud-config; charset=us-ascii +MIME-Version: 1.0 + +bootcmd: + +output: + all: "| tee -a /var/log/cloud-init-output.log | logger -t user-data -s 2>/dev/ttyS0" +write_files: + - path: /tmp/dna.json + permissions: '0644' + owner: root:root + content: | + {"cluster":{"base_os":"alinux2","cluster_config_s3_key":"parallelcluster/clusters/dummy-cluster-randomstring123/configs/cluster-config-with-implied-values.yaml","cluster_config_version":"","cluster_name":"clustername","cluster_s3_bucket":"parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete","cluster_user":"ec2-user","custom_awsbatchcli_package":"","custom_node_package":"","cw_logging_enabled":"true","default_user_home":"local","directory_service":{"domain_read_only_user":"","enabled":"false","generate_ssh_keys_for_users":"false"},"disable_sudo_access_for_default_user":"true","dns_domain":"{\"Ref\": \"referencetoclusternameClusterDNSDomain8D0872E1Ref\"}","ebs_shared_dirs":"","efs_encryption_in_transits":"","efs_fs_ids":"","efs_iam_authorizations":"","efs_shared_dirs":"","efs_access_point_ids":"","enable_intel_hpc_platform":"false","ephemeral_dir":"/scratch","fsx_dns_names":"","fsx_fs_ids":"","fsx_fs_types":"","fsx_mount_names":"","fsx_shared_dirs":"","fsx_volume_junction_paths":"","head_node_private_ip":"{\"Ref\": \"referencetoclusternameHeadNodeENI6497A502PrimaryPrivateIpAddress\"}","hosted_zone":"{\"Ref\": \"referencetoclusternameRoute53HostedZone2388733DRef\"}","dcv_enabled":"login_node","dcv_port":8444,"log_group_name":"/aws/parallelcluster/clustername-202401151530","log_rotation_enabled":"true","pool_name":"login","node_type":"LoginNode","proxy":"NONE","raid_shared_dir":"","raid_type":"","region":"us-east-1","scheduler":"slurm","shared_storage_type":"ebs","stack_arn":"{\"Ref\": \"AWS::StackId\"}","stack_name":"clustername","use_private_hostname":"false","launch_template_id":"LoginNodeLaunchTemplate2736fab291f04e69"}} + - path: /tmp/extra.json + permissions: '0644' + owner: 
root:root + content: | + {} + - path: /tmp/bootstrap.sh + permissions: '0744' + owner: root:root + content: | + #!/bin/bash -x + + function error_exit + { + echo "Bootstrap failed with error: $1" + } + +--==BOUNDARY== +Content-Type: text/x-shellscript; charset="us-ascii" +MIME-Version: 1.0 + +#!/bin/bash -x + +function error_exit +{ + exit 1 +} +--==BOUNDARY== diff --git a/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_2.txt b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_2.txt new file mode 100644 index 0000000000..b4481a3c70 --- /dev/null +++ b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_2.txt @@ -0,0 +1,46 @@ +Content-Type: multipart/mixed; boundary="==BOUNDARY==" +MIME-Version: 1.0 + +--==BOUNDARY== +Content-Type: text/cloud-boothook; charset="us-ascii" +MIME-Version: 1.0 + +#!/bin/bash -x + + which dnf 2>/dev/null; dnf=$? + which yum 2>/dev/null; yum=$? 
+ +--==BOUNDARY== +Content-Type: text/cloud-config; charset=us-ascii +MIME-Version: 1.0 + +write_files: + - path: /tmp/dna.json + permissions: '0644' + owner: root:root + content: | + {"cluster":{"base_os":"alinux2"}} + - path: /tmp/extra.json + permissions: '0644' + owner: root:root + content: | + {"cluster": {"nvidia": {"enabled": "yes" }, "is_official_ami_build": "true"}} + - path: /tmp/bootstrap.sh + permissions: '0744' + owner: root:root + content: | + #!/bin/bash -x + + echo "Bootstrap failed with error: $1" + +--==BOUNDARY== +Content-Type: text/x-shellscript; charset="us-ascii" +MIME-Version: 1.0 + +#!/bin/bash -x +function error_exit +{ + exit 1 +} + +--==BOUNDARY== \ No newline at end of file From 016d25d813ecbbe1ce442410eb4bef14d965f540 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 18 Feb 2025 14:02:28 -0500 Subject: [PATCH 12/12] Cookstyle and code-linters corrections * Correcting Kitchen and Unit tests * Adding share_compute_fleet_dna.py for tox checks --- .../share_compute_fleet_dna.py | 89 +++++----- .../recipes/install/cfn_bootstrap.rb | 2 +- .../spec/unit/recipes/cfn_bootstrap_spec.rb | 2 +- .../spec/unit/recipes/config_spec.rb | 1 - .../resources/cfn_hup_configuration_spec.rb | 82 ++++----- .../controls/cfn_hup_configuration_spec.rb | 2 +- .../unit/resources/fetch_dna_files_spec.rb | 4 +- .../recipes/update/update_head_node.rb | 2 +- .../test_share_compute_fleet_dna.py | 155 +++++++++++++----- .../user_data_1.txt | 4 +- tox.ini | 1 + 11 files changed, 210 insertions(+), 134 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/share_compute_fleet_dna.py b/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/share_compute_fleet_dna.py index ac67af520e..729a610fb7 100644 --- a/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/share_compute_fleet_dna.py +++ b/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration/share_compute_fleet_dna.py 
@@ -11,30 +11,36 @@ # limitations under the License. -import configparser import argparse -from email import message_from_string +import base64 +import configparser import json +import logging import os +from email import message_from_string + import boto3 -from botocore.config import Config import yaml -import base64 -import logging +from botocore.config import Config from retrying import retry COMPUTE_FLEET_SHARED_LOCATION = "/opt/parallelcluster/shared/" -COMPUTE_FLEET_SHARED_DNA_LOCATION = COMPUTE_FLEET_SHARED_LOCATION + 'dna/' +COMPUTE_FLEET_SHARED_DNA_LOCATION = COMPUTE_FLEET_SHARED_LOCATION + "dna/" -COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG = COMPUTE_FLEET_SHARED_LOCATION + 'launch-templates-config.json' +COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG = COMPUTE_FLEET_SHARED_LOCATION + "launch-templates-config.json" logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) + def get_compute_launch_template_ids(lt_config_file_name): - """Load launch-templates-config.json which contains ID, Version number and Logical ID of all queues in Compute Fleet's Launch Template. - The format of launch-templates-config.json is + """ + Load launch-templates-config.json. + + It contains ID, Version number and Logical ID of all queues in Compute Fleet's Launch Template. 
+ + The format of launch-templates-config.json: { "Queues": { "queue1": { @@ -61,24 +67,25 @@ def get_compute_launch_template_ids(lt_config_file_name): } } } + """ lt_config = None try: - with open(lt_config_file_name, 'r') as file: + with open(lt_config_file_name, "r", encoding="utf-8") as file: lt_config = json.loads(file.read()) except Exception as err: - logger.warn("Unable to read %s due to %s", lt_config_file_name, err) + logger.warning("Unable to read %s due to %s", lt_config_file_name, err) - return lt_config + return lt_config def share_compute_fleet_dna(args): - """Creates all dna.json for each queue in cluster.""" + """Create dna.json for each queue in cluster.""" lt_config = get_compute_launch_template_ids(COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG) if lt_config: - all_queues = lt_config.get('Queues') + all_queues = lt_config.get("Queues") for _, queues in all_queues.items(): - compute_resources = queues.get('ComputeResources') + compute_resources = queues.get("ComputeResources") for _, compute_res in compute_resources.items(): get_latest_dna_data(compute_res, COMPUTE_FLEET_SHARED_DNA_LOCATION, args) @@ -98,11 +105,12 @@ def parse_proxy_config(): @retry(stop_max_attempt_number=5, wait_fixed=3000) def get_user_data(lt_id, lt_version, region_name): """ - Calls EC2 DescribeLaunchTemplateVersions API to get UserData from Launch Template specified. + Get UserData from specified Launch Template using EC2 DescribeLaunchTemplateVersions API. 
+ :param lt_id: Launch Template ID (eg: lt-12345678901234567) :param lt_version: Launch Template latest Version Number (eg: 2) :param region_name: AWS region name (eg: us-east-1) - :return: User_data in MIME format + :return: string of user_data in MIME format """ decoded_data = None try: @@ -110,33 +118,30 @@ def get_user_data(lt_id, lt_version, region_name): ec2_client = boto3.client("ec2", region_name=region_name, config=proxy_config) response = ec2_client.describe_launch_template_versions( - LaunchTemplateId= lt_id, + LaunchTemplateId=lt_id, Versions=[ lt_version, ], - ).get('LaunchTemplateVersions') - decoded_data = base64.b64decode(response[0]['LaunchTemplateData']['UserData'], validate=True).decode('utf-8') + ).get("LaunchTemplateVersions") + decoded_data = base64.b64decode(response[0]["LaunchTemplateData"]["UserData"], validate=True).decode("utf-8") except Exception as err: if hasattr(err, "message"): err = err.message logger.error( - "Unable to get UserData for launch template %s with version %s.\nException: %s", - lt_id, lt_version, err + "Unable to get UserData for launch template %s with version %s.\nException: %s", lt_id, lt_version, err ) return decoded_data def get_write_directives_section(user_data): - """ - Parses MIME formatted UserData that we get from EC2 to extract write_files section from cloud-config section. 
- """ + """Get write_files section from cloud-config section of MIME formatted UserData.""" write_directives_section = None try: data = message_from_string(user_data) for cloud_config_section in data.walk(): - if cloud_config_section.get_content_type() == 'text/cloud-config': - write_directives_section = yaml.safe_load(cloud_config_section._payload).get('write_files') + if cloud_config_section.get_content_type() == "text/cloud-config": + write_directives_section = yaml.safe_load(cloud_config_section._payload).get("write_files") except Exception as err: logger.error("Error occurred while parsing write_files section.\nException: %s", err) return write_directives_section @@ -144,17 +149,18 @@ def get_write_directives_section(user_data): def write_dna_files(write_files_section, shared_storage_loc): """ - Writes the dna.json in shared location after extracting it from write_files section of UserData. + After extracting dna.json from write_files section of UserData, write it in shared location. + :param write_files_section: Entire write_files section from UserData :param shared_storage_loc: Shared Storage Location of where to write dna.json :return: None """ try: - file_path = shared_storage_loc+"-dna.json" + file_path = shared_storage_loc + "-dna.json" for data in write_files_section: - if data['path'] in ['/tmp/dna.json']: - with open(file_path,"w") as file: - file.write(json.dumps(json.loads(data['content']),indent=4)) + if data["path"] in ["/tmp/dna.json"]: # nosec B108 + with open(file_path, "w", encoding="utf-8") as file: + file.write(json.dumps(json.loads(data["content"]), indent=4)) except Exception as err: if hasattr(err, "message"): err = err.message @@ -163,16 +169,19 @@ def write_dna_files(write_files_section, shared_storage_loc): def get_latest_dna_data(resource, output_location, args): """ - Function to get latest User Data, extract relevant details and write dna.json. + Get latest User Data, extract relevant details and write dna.json. 
+ :param resource: Resource containing LT ID, Version and Logical id :param output_location: Shared Storage Location were we want to write dna.json :param args: Command Line arguments :rtype: None """ - user_data = get_user_data(resource.get('LaunchTemplate').get('Id'), resource.get('LaunchTemplate').get('Version'), args.region) + user_data = get_user_data( + resource.get("LaunchTemplate").get("Id"), resource.get("LaunchTemplate").get("Version"), args.region + ) if user_data: write_directives = get_write_directives_section(user_data) - write_dna_files(write_directives, output_location+resource.get('LaunchTemplate').get("LogicalId")) + write_dna_files(write_directives, output_location + resource.get("LaunchTemplate").get("LogicalId")) def cleanup(directory_loc): @@ -183,7 +192,7 @@ def cleanup(directory_loc): if os.path.isfile(f_path): os.remove(f_path) except Exception as err: - logger.warn(f"Unable to delete %s due to %s", f_path, err) + logger.warning("Unable to delete %s due to %s", f_path, err) def _parse_cli_args(): @@ -195,9 +204,9 @@ def _parse_cli_args(): parser.add_argument( "-r", "--region", + required=False, type=str, default=os.getenv("AWS_REGION", None), - required=False, help="the cluster AWS region, defaults to AWS_REGION env variable", ) @@ -224,9 +233,11 @@ def main(): except Exception as err: if hasattr(err, "message"): err = err.message - logger.exception("Encountered exception when fetching latest dna.json for ComputeFleet, exiting gracefully: %s", err) + logger.exception( + "Encountered exception when fetching latest dna.json for ComputeFleet, exiting gracefully: %s", err + ) raise SystemExit(0) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb index 7c86e45b7e..41fb845c4f 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb 
+++ b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb @@ -94,7 +94,7 @@ # Add cfn-hup runner template "#{node['cluster']['scripts_dir']}/cfn-hup-runner.sh" do - source "cfn_bootstrap/cfn-hup-runner.sh.erb" + source "cfn_hup_configuration/cfn-hup-runner.sh.erb" owner 'root' group 'root' mode '0744' diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb index 4e12931a1f..73bd8cd4e5 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb @@ -77,7 +77,7 @@ it 'adds cfn-hup runner' do is_expected.to create_template("#{node['cluster']['scripts_dir']}/cfn-hup-runner.sh").with( - source: "cfn_bootstrap/cfn-hup-runner.sh.erb", + source: "cfn_hup_configuration/cfn-hup-runner.sh.erb", owner: 'root', group: 'root', mode: '0744', diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_spec.rb index e4a80b2679..a0d1402048 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_spec.rb @@ -27,7 +27,6 @@ aws-parallelcluster-environment::raid aws-parallelcluster-environment::efs aws-parallelcluster-environment::fsx - aws-parallelcluster-environment::config_cfn_hup ) @expected_recipes.each do |recipe_name| allow_any_instance_of(Chef::Recipe).to receive(:include_recipe).with(recipe_name) do diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb index 071aed0558..c522c426bc 100644 --- 
a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cfn_hup_configuration_spec.rb @@ -18,7 +18,7 @@ def self.configure(chef_run) LAUNCH_TEMPLATE_ID = "LAUNCH_TEMPLATE_ID".freeze SCRIPT_DIR = "SCRIPT_DIR".freeze MONITOR_SHARED_DIR = "MONITOR_SHARED_DIR".freeze -NODE_BOOTSTRAP_TIMEOUT = "1800" +NODE_BOOTSTRAP_TIMEOUT = "1800".freeze describe 'cfn_hup_configuration:configure' do for_all_oses do |platform, version| @@ -29,8 +29,8 @@ def self.configure(chef_run) runner = runner(platform: platform, version: version, step_into: ['cfn_hup_configuration']) do |node| allow_any_instance_of(Object).to receive(:get_metadata_token).and_return("IMDS_TOKEN") allow_any_instance_of(Object).to receive(:get_metadata_with_token) - .with("IMDS_TOKEN", URI("http://169.254.169.254/latest/meta-data/iam/security-credentials")) - .and_return(INSTANCE_ROLE_NAME) + .with("IMDS_TOKEN", URI("http://169.254.169.254/latest/meta-data/iam/security-credentials")) + .and_return(INSTANCE_ROLE_NAME) node.override["cluster"]["node_type"] = node_type node.override["cluster"]["region"] = AWS_REGION node.override["cluster"]["aws_domain"] = AWS_DOMAIN @@ -49,63 +49,63 @@ def self.configure(chef_run) %w(/etc/cfn /etc/cfn/hooks.d).each do |dir| it "creates the directory #{dir}" do is_expected.to create_directory(dir) - .with(owner: 'root') - .with(group: 'root') - .with(mode: "0700") - .with(recursive: true) + .with(owner: 'root') + .with(group: 'root') + .with(mode: "0700") + .with(recursive: true) end end it "creates the file /etc/cfn/cfn-hup.conf" do is_expected.to create_template("/etc/cfn/cfn-hup.conf") - .with(source: 'cfn_hup_configuration/cfn-hup.conf.erb') - .with(user: "root") - .with(group: "root") - .with(mode: "0400") - .with(variables: { - stack_id: STACK_ID, - region: AWS_REGION, - cloudformation_url: CLOUDFORMATION_URL, - cfn_init_role: INSTANCE_ROLE_NAME, - }) + .with(source: 
'cfn_hup_configuration/cfn-hup.conf.erb') + .with(user: "root") + .with(group: "root") + .with(mode: "0400") + .with(variables: { + stack_id: STACK_ID, + region: AWS_REGION, + cloudformation_url: CLOUDFORMATION_URL, + cfn_init_role: INSTANCE_ROLE_NAME, + }) end it "creates the file /etc/cfn/hooks.d/pcluster-update.conf" do is_expected.to create_template("/etc/cfn/hooks.d/pcluster-update.conf") - .with(source: 'cfn_hup_configuration/cfn-hook-update.conf.erb') - .with(user: "root") - .with(group: "root") - .with(mode: "0400") - .with(variables: { - stack_id: STACK_ID, - region: AWS_REGION, - cloudformation_url: CLOUDFORMATION_URL, - cfn_init_role: INSTANCE_ROLE_NAME, - launch_template_resource_id: LAUNCH_TEMPLATE_ID, - update_hook_script_dir: SCRIPT_DIR, - node_bootstrap_timeout: NODE_BOOTSTRAP_TIMEOUT, - }) + .with(source: 'cfn_hup_configuration/cfn-hook-update.conf.erb') + .with(user: "root") + .with(group: "root") + .with(mode: "0400") + .with(variables: { + stack_id: STACK_ID, + region: AWS_REGION, + cloudformation_url: CLOUDFORMATION_URL, + cfn_init_role: INSTANCE_ROLE_NAME, + launch_template_resource_id: LAUNCH_TEMPLATE_ID, + update_hook_script_dir: SCRIPT_DIR, + node_bootstrap_timeout: NODE_BOOTSTRAP_TIMEOUT, + }) end if %(ComputeFleet).include?(node_type) it "creates the file #{SCRIPT_DIR}/cfn-hup-update-action.sh" do is_expected.to create_template("#{SCRIPT_DIR}/cfn-hup-update-action.sh") - .with(source: "cfn_hup_configuration/#{node_type}/cfn-hup-update-action.sh.erb") - .with(user: "root") - .with(group: "root") - .with(mode: "0700") - .with(variables: { - monitor_shared_dir: "#{MONITOR_SHARED_DIR}/dna", - launch_template_resource_id: LAUNCH_TEMPLATE_ID, + .with(source: "cfn_hup_configuration/#{node_type}/cfn-hup-update-action.sh.erb") + .with(user: "root") + .with(group: "root") + .with(mode: "0700") + .with(variables: { + monitor_shared_dir: "#{MONITOR_SHARED_DIR}/dna", + launch_template_resource_id: LAUNCH_TEMPLATE_ID, }) end elsif node_type == 
'HeadNode' it "creates #{SCRIPT_DIR}/share_compute_fleet_dna.py" do is_expected.to create_if_missing_cookbook_file("#{SCRIPT_DIR}/share_compute_fleet_dna.py") - .with(source: 'cfn_hup_configuration/share_compute_fleet_dna.py') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0700') + .with(source: 'cfn_hup_configuration/share_compute_fleet_dna.py') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0700') end it "creates the directory #{MONITOR_SHARED_DIR}/dna" do diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb index 54170f695c..49a3fb7d43 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_hup_configuration_spec.rb @@ -42,7 +42,7 @@ its('group') { should eq 'root' } end - describe directory("#{node['cluster']['base_dir']}/dna") do + describe directory("#{node['cluster']['shared_dir']}/dna") do it { should exist } end end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb index 7216bb60ea..8a6d937814 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fetch_dna_files_spec.rb @@ -35,7 +35,7 @@ def self.cleanup(chef_run) node.override['cluster']['region'] = region node.override['kitchen'] = true end - ConvergeFetchDnaFiles.share(runner, extra_chef_attribute_location: "#{kitchen_instance_types_data_path}") + ConvergeFetchDnaFiles.share(runner, extra_chef_attribute_location: "#{kitchen_instance_types_data_path}") end cached(:node) { chef_run.node } @@ -46,7 +46,7 @@ def self.cleanup(chef_run) # end it 'runs share_compute_fleet_dna.py to get dna files' do - 
is_expected.to run_execute('Share dna.json with ComputeFleet').with( + is_expected.to run_execute('Run share_compute_fleet_dna.py to get user_data.sh and share dna.json with ComputeFleet').with( command: "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/share_compute_fleet_dna.py" \ " --region #{node['cluster']['region']}" ) diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb index b6ce642b93..7a131f706a 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb @@ -284,4 +284,4 @@ def update_nodes_in_queue(strategy, queues) fetch_dna_files 'Cleanup' do action :cleanup -end \ No newline at end of file +end diff --git a/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna.py b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna.py index d46b200fbf..cc85c2c098 100644 --- a/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna.py +++ b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna.py @@ -8,23 +8,58 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
+import json import os +from base64 import b64encode from unittest.mock import MagicMock, patch + import boto3 import pytest -import json from assertpy import assert_that from botocore.stub import Stubber -from share_compute_fleet_dna import get_compute_launch_template_ids, get_user_data, get_write_directives_section, parse_proxy_config -from base64 import b64encode +from share_compute_fleet_dna import ( + get_compute_launch_template_ids, + get_user_data, + get_write_directives_section, + parse_proxy_config, +) @pytest.mark.parametrize( ("launch_template_config_content", "errors"), [ - ("{\"Queues\":{\"queue-0\":{\"ComputeResources\":{\"compute-resource-1\":{\"LaunchTemplate\":{\"Version\":\"1\",\"Id\":\"lt-037123456747c3bc5\"}},\"compute-resource-2\":{\"LaunchTemplate\":{\"Version\":\"1\",\"Id\":\"lt-0fcecb59a3721c0b3\"}},\"compute-resource-0\":{\"LaunchTemplate\":{\"Version\":\"1\",\"Id\":\"lt-12345678901234567\"}}}}}}", False), - ("{\"Queues\":{\"queue-0\":}}}", True), - + ( + """ + { + "Queues": { + "queue-0": { + "ComputeResources": { + "compute-resource-1": { + "LaunchTemplate": { + "Version": "1", + "Id": "lt-037123456747c3bc5" + } + }, + "compute-resource-2": { + "LaunchTemplate": { + "Version": "1", + "Id": "lt-0fcecb59a3721c0b3" + } + }, + "compute-resource-0": { + "LaunchTemplate": { + "Version": "1", + "Id": "lt-12345678901234567" + } + } + } + } + } + } + """, + False, + ), + ('{"Queues":{"queue-0":}}}', True), ], ) def test_get_compute_launch_template_ids(mocker, launch_template_config_content, errors): @@ -39,72 +74,102 @@ def test_get_compute_launch_template_ids(mocker, launch_template_config_content, @pytest.mark.parametrize( ("mime_user_data_file", "write_section"), [ - ("user_data_1.txt", - [{'path': '/tmp/dna.json', 'permissions': '0644', 'owner': 'root:root', 'content': 
'{"cluster":{"base_os":"alinux2","cluster_config_s3_key":"parallelcluster/clusters/dummy-cluster-randomstring123/configs/cluster-config-with-implied-values.yaml","cluster_config_version":"","cluster_name":"clustername","cluster_s3_bucket":"parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete","cluster_user":"ec2-user","custom_awsbatchcli_package":"","custom_node_package":"","cw_logging_enabled":"true","default_user_home":"local","directory_service":{"domain_read_only_user":"","enabled":"false","generate_ssh_keys_for_users":"false"},"disable_sudo_access_for_default_user":"true","dns_domain":"{\\"Ref\\": \\"referencetoclusternameClusterDNSDomain8D0872E1Ref\\"}","ebs_shared_dirs":"","efs_encryption_in_transits":"","efs_fs_ids":"","efs_iam_authorizations":"","efs_shared_dirs":"","efs_access_point_ids":"","enable_intel_hpc_platform":"false","ephemeral_dir":"/scratch","fsx_dns_names":"","fsx_fs_ids":"","fsx_fs_types":"","fsx_mount_names":"","fsx_shared_dirs":"","fsx_volume_junction_paths":"","head_node_private_ip":"{\\"Ref\\": \\"referencetoclusternameHeadNodeENI6497A502PrimaryPrivateIpAddress\\"}","hosted_zone":"{\\"Ref\\": \\"referencetoclusternameRoute53HostedZone2388733DRef\\"}","dcv_enabled":"login_node","dcv_port":8444,"log_group_name":"/aws/parallelcluster/clustername-202401151530","log_rotation_enabled":"true","pool_name":"login","node_type":"LoginNode","proxy":"NONE","raid_shared_dir":"","raid_type":"","region":"us-east-1","scheduler":"slurm","shared_storage_type":"ebs","stack_arn":"{\\"Ref\\": \\"AWS::StackId\\"}","stack_name":"clustername","use_private_hostname":"false","launch_template_id":"LoginNodeLaunchTemplate2736fab291f04e69"}}\n'}, {'path': '/tmp/extra.json', 'permissions': '0644', 'owner': 'root:root', 'content': '{}\n'}, {'path': '/tmp/bootstrap.sh', 'permissions': '0744', 'owner': 'root:root', 'content': '#!/bin/bash -x\n\nfunction error_exit\n{\n echo "Bootstrap failed with error: $1"\n}\n'}] - ), - ("user_data_2.txt", - [{'content': 
'{"cluster":{"base_os":"alinux2"}}\n', 'owner': 'root:root', 'path': '/tmp/dna.json', 'permissions': '0644'}, {'content': '{"cluster": {"nvidia": {"enabled": "yes" }, "is_official_ami_build": "true"}}\n', 'owner': 'root:root', 'path': '/tmp/extra.json', 'permissions': '0644'}, {'content': '#!/bin/bash -x\n\necho "Bootstrap failed with error: $1"\n', 'owner': 'root:root', 'path': '/tmp/bootstrap.sh', 'permissions': '0744'}] - ), - ("",None), + ( + "user_data_1.txt", + [ + { + "path": "/tmp/dna.json", # nosec B108 + "permissions": "0644", + "owner": "root:root", + "content": '{"cluster":{"base_os":"alinux2","cluster_name":"clustername",' + '"directory_service":{"domain_read_only_user":"","enabled":"false",' + '"generate_ssh_keys_for_users":"false"},' + '"launch_template_id":"LoginNodeLaunchTemplate2736fab291f04e69"}}\n', + }, + { + "path": "/tmp/extra.json", # nosec B108 + "permissions": "0644", + "owner": "root:root", + "content": "{}\n", + }, + { + "path": "/tmp/bootstrap.sh", # nosec B108 + "permissions": "0744", + "owner": "root:root", + "content": '#!/bin/bash -x\n\nfunction error_exit\n{\n echo "Bootstrap failed"\n}\n', + }, + ], + ), + ( + "user_data_2.txt", + [ + { + "content": '{"cluster":{"base_os":"alinux2"}}\n', + "owner": "root:root", + "path": "/tmp/dna.json", # nosec B108 + "permissions": "0644", + }, + { + "content": '{"cluster": {"nvidia": {"enabled": "yes" }, "is_official_ami_build": "true"}}\n', + "owner": "root:root", + "path": "/tmp/extra.json", # nosec B108 + "permissions": "0644", + }, + { + "content": '#!/bin/bash -x\n\necho "Bootstrap failed with error: $1"\n', + "owner": "root:root", + "path": "/tmp/bootstrap.sh", # nosec B108 + "permissions": "0744", + }, + ], + ), + ("", None), ], ) def test_get_write_directives_section(mime_user_data_file, write_section, test_datadir): input_mime_user_data = None if mime_user_data_file: - with open(os.path.join(test_datadir, mime_user_data_file), 'r') as file: + with open(os.path.join(test_datadir, 
mime_user_data_file), "r", encoding="utf-8") as file: input_mime_user_data = file.read().strip() assert_that(get_write_directives_section(input_mime_user_data)).is_equal_to(write_section) -@pytest.mark.parametrize( - ("error", "proxy", "port"), - [ - (True, "myproxy.com", "8080"), - (False, "", "") - ] -) +@pytest.mark.parametrize(("error", "proxy", "port"), [(True, "myproxy.com", "8080"), (False, "", "")]) def test_parse_proxy_config(error, proxy, port): - mock_config = MagicMock(return_value = error) - mock_config.get.side_effect = [ proxy, port ] - expected_op = {'https': proxy+':'+ port } - with patch('configparser.RawConfigParser', return_value=mock_config): + mock_config = MagicMock(return_value=error) + mock_config.get.side_effect = [proxy, port] + expected_op = {"https": proxy + ":" + port} + with patch("configparser.RawConfigParser", return_value=mock_config): assert_that(parse_proxy_config().proxies).is_equal_to(expected_op) def ec2_describe_launch_template_versions_mock(response, lt_id, lt_version): - e2_client = boto3.client('ec2', region_name='us-east-1') + e2_client = boto3.client("ec2", region_name="us-east-1") stubber = Stubber(e2_client) - stubber.add_response('describe_launch_template_versions', response, { - 'LaunchTemplateId': lt_id, - 'Versions': [lt_version] - }) + stubber.add_response( + "describe_launch_template_versions", response, {"LaunchTemplateId": lt_id, "Versions": [lt_version]} + ) stubber.activate() return e2_client, stubber @pytest.mark.parametrize( - ('expected_user_data' ), - [ - ("#!/bin/bash\necho 'Test'"), - ("") - - ], + ("expected_user_data"), + [("#!/bin/bash\necho 'Test'"), ("")], ) def test_get_user_data(expected_user_data): lt_id, lt_version = "lt-12345678901234567", "1" ec2_response = { - "LaunchTemplateVersions": [{ - "LaunchTemplateData": { - "UserData": b64encode(expected_user_data.encode()).decode('utf-8') - } - }] - } + "LaunchTemplateVersions": [ + {"LaunchTemplateData": {"UserData": 
b64encode(expected_user_data.encode()).decode("utf-8")}} + ] + } ec2_client, stubber = ec2_describe_launch_template_versions_mock(ec2_response, lt_id, lt_version) - with patch('boto3.client') as mock_client: + with patch("boto3.client") as mock_client: mock_client.return_value = ec2_client with stubber: - assert_that(get_user_data(lt_id, lt_version, 'us-east-1')).is_equal_to(expected_user_data) - stubber.deactivate() \ No newline at end of file + assert_that(get_user_data(lt_id, lt_version, "us-east-1")).is_equal_to(expected_user_data) + stubber.deactivate() diff --git a/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_1.txt b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_1.txt index 3d4f182950..e8e35affad 100644 --- a/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_1.txt +++ b/test/unit/cfn_hup_configuration/test_share_compute_fleet_dna/test_get_write_directives_section/user_data_1.txt @@ -23,7 +23,7 @@ write_files: permissions: '0644' owner: root:root content: | - {"cluster":{"base_os":"alinux2","cluster_config_s3_key":"parallelcluster/clusters/dummy-cluster-randomstring123/configs/cluster-config-with-implied-values.yaml","cluster_config_version":"","cluster_name":"clustername","cluster_s3_bucket":"parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete","cluster_user":"ec2-user","custom_awsbatchcli_package":"","custom_node_package":"","cw_logging_enabled":"true","default_user_home":"local","directory_service":{"domain_read_only_user":"","enabled":"false","generate_ssh_keys_for_users":"false"},"disable_sudo_access_for_default_user":"true","dns_domain":"{\"Ref\": 
\"referencetoclusternameClusterDNSDomain8D0872E1Ref\"}","ebs_shared_dirs":"","efs_encryption_in_transits":"","efs_fs_ids":"","efs_iam_authorizations":"","efs_shared_dirs":"","efs_access_point_ids":"","enable_intel_hpc_platform":"false","ephemeral_dir":"/scratch","fsx_dns_names":"","fsx_fs_ids":"","fsx_fs_types":"","fsx_mount_names":"","fsx_shared_dirs":"","fsx_volume_junction_paths":"","head_node_private_ip":"{\"Ref\": \"referencetoclusternameHeadNodeENI6497A502PrimaryPrivateIpAddress\"}","hosted_zone":"{\"Ref\": \"referencetoclusternameRoute53HostedZone2388733DRef\"}","dcv_enabled":"login_node","dcv_port":8444,"log_group_name":"/aws/parallelcluster/clustername-202401151530","log_rotation_enabled":"true","pool_name":"login","node_type":"LoginNode","proxy":"NONE","raid_shared_dir":"","raid_type":"","region":"us-east-1","scheduler":"slurm","shared_storage_type":"ebs","stack_arn":"{\"Ref\": \"AWS::StackId\"}","stack_name":"clustername","use_private_hostname":"false","launch_template_id":"LoginNodeLaunchTemplate2736fab291f04e69"}} + {"cluster":{"base_os":"alinux2","cluster_name":"clustername","directory_service":{"domain_read_only_user":"","enabled":"false","generate_ssh_keys_for_users":"false"},"launch_template_id":"LoginNodeLaunchTemplate2736fab291f04e69"}} - path: /tmp/extra.json permissions: '0644' owner: root:root @@ -37,7 +37,7 @@ write_files: function error_exit { - echo "Bootstrap failed with error: $1" + echo "Bootstrap failed" } --==BOUNDARY== diff --git a/tox.ini b/tox.ini index 39606116c1..c5b7cb6c57 100644 --- a/tox.ini +++ b/tox.ini @@ -38,6 +38,7 @@ src_dirs = {toxinidir}/cookbooks/aws-parallelcluster-computefleet/files/clusterstatusmgtd \ {toxinidir}/cookbooks/aws-parallelcluster-environment/files/custom_action_executor \ {toxinidir}/cookbooks/aws-parallelcluster-environment/files/default/ec2_udev_rules \ + {toxinidir}/cookbooks/aws-parallelcluster-environment/files/cfn_hup_configuration \ 
{toxinidir}/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm \ {toxinidir}/cookbooks/aws-parallelcluster-slurm/files/default/head_node_checks \ {toxinidir}/cookbooks/aws-parallelcluster-slurm/files/default/config_slurm/scripts