|
| 1 | +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"). |
| 4 | +# You may not use this file except in compliance with the |
| 5 | +# License. A copy of the License is located at |
| 6 | +# |
| 7 | +# http://aws.amazon.com/apache2.0/ |
| 8 | +# |
| 9 | +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES |
| 10 | +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and |
| 11 | +# limitations under the License. |
| 12 | + |
| 13 | + |
| 14 | +import argparse |
| 15 | +import base64 |
| 16 | +import configparser |
| 17 | +import json |
| 18 | +import logging |
| 19 | +import os |
| 20 | +from email import message_from_string |
| 21 | + |
| 22 | +import boto3 |
| 23 | +import yaml |
| 24 | +from botocore.config import Config |
| 25 | +from retrying import retry |
| 26 | + |
| 27 | +COMPUTE_FLEET_SHARED_LOCATION = "/opt/parallelcluster/shared/" |
| 28 | + |
| 29 | +COMPUTE_FLEET_SHARED_DNA_LOCATION = COMPUTE_FLEET_SHARED_LOCATION + "dna/" |
| 30 | + |
| 31 | +COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG = COMPUTE_FLEET_SHARED_LOCATION + "launch-templates-config.json" |
| 32 | + |
| 33 | +logger = logging.getLogger(__name__) |
| 34 | +logging.basicConfig(level=logging.INFO) |
| 35 | + |
| 36 | + |
| 37 | +def get_compute_launch_template_ids(lt_config_file_name): |
| 38 | + """ |
| 39 | + Load launch-templates-config.json. |
| 40 | +
|
| 41 | + It contains ID, Version number and Logical ID of all queues in Compute Fleet's Launch Template. |
| 42 | +
|
| 43 | + The format of launch-templates-config.json: |
| 44 | + { |
| 45 | + "Queues": { |
| 46 | + "queue1": { |
| 47 | + "ComputeResources": { |
| 48 | + "queue1-i1": { |
| 49 | + "LaunchTemplate": { |
| 50 | + "Version": "1", |
| 51 | + "LogicalId": "LaunchTemplate123456789012345", |
| 52 | + "Id": "lt-12345678901234567" |
| 53 | + } |
| 54 | + } |
| 55 | + } |
| 56 | + }, |
| 57 | + "queue2": { |
| 58 | + "ComputeResources": { |
| 59 | + "queue2-i1": { |
| 60 | + "LaunchTemplate": { |
| 61 | + "Version": "1", |
| 62 | + "LogicalId": "LaunchTemplate012345678901234", |
| 63 | + "Id": "lt-01234567890123456" |
| 64 | + } |
| 65 | + } |
| 66 | + } |
| 67 | + } |
| 68 | + } |
| 69 | + } |
| 70 | +
|
| 71 | + """ |
| 72 | + lt_config = None |
| 73 | + try: |
| 74 | + with open(lt_config_file_name, "r", encoding="utf-8") as file: |
| 75 | + lt_config = json.loads(file.read()) |
| 76 | + except Exception as err: |
| 77 | + logger.warning("Unable to read %s due to %s", lt_config_file_name, err) |
| 78 | + |
| 79 | + return lt_config |
| 80 | + |
| 81 | + |
| 82 | +def share_compute_fleet_dna(args): |
| 83 | + """Create dna.json for each queue in cluster.""" |
| 84 | + lt_config = get_compute_launch_template_ids(COMPUTE_FLEET_LAUNCH_TEMPLATE_CONFIG) |
| 85 | + if lt_config: |
| 86 | + all_queues = lt_config.get("Queues") |
| 87 | + for _, queues in all_queues.items(): |
| 88 | + compute_resources = queues.get("ComputeResources") |
| 89 | + for _, compute_res in compute_resources.items(): |
| 90 | + get_latest_dna_data(compute_res, COMPUTE_FLEET_SHARED_DNA_LOCATION, args) |
| 91 | + |
| 92 | + |
| 93 | +# FIXME: Fix Code Duplication |
| 94 | +def parse_proxy_config(): |
| 95 | + config = configparser.RawConfigParser() |
| 96 | + config.read("/etc/boto.cfg") |
| 97 | + proxy_config = Config() |
| 98 | + if config.has_option("Boto", "proxy") and config.has_option("Boto", "proxy_port"): |
| 99 | + proxy = config.get("Boto", "proxy") |
| 100 | + proxy_port = config.get("Boto", "proxy_port") |
| 101 | + proxy_config = Config(proxies={"https": f"{proxy}:{proxy_port}"}) |
| 102 | + return proxy_config |
| 103 | + |
| 104 | + |
| 105 | +@retry(stop_max_attempt_number=5, wait_fixed=3000) |
| 106 | +def get_user_data(lt_id, lt_version, region_name): |
| 107 | + """ |
| 108 | + Get UserData from specified Launch Template using EC2 DescribeLaunchTemplateVersions API. |
| 109 | +
|
| 110 | + :param lt_id: Launch Template ID (eg: lt-12345678901234567) |
| 111 | + :param lt_version: Launch Template latest Version Number (eg: 2) |
| 112 | + :param region_name: AWS region name (eg: us-east-1) |
| 113 | + :return: string of user_data in MIME format |
| 114 | + """ |
| 115 | + decoded_data = None |
| 116 | + try: |
| 117 | + proxy_config = parse_proxy_config() |
| 118 | + |
| 119 | + ec2_client = boto3.client("ec2", region_name=region_name, config=proxy_config) |
| 120 | + response = ec2_client.describe_launch_template_versions( |
| 121 | + LaunchTemplateId=lt_id, |
| 122 | + Versions=[ |
| 123 | + lt_version, |
| 124 | + ], |
| 125 | + ).get("LaunchTemplateVersions") |
| 126 | + decoded_data = base64.b64decode(response[0]["LaunchTemplateData"]["UserData"], validate=True).decode("utf-8") |
| 127 | + except Exception as err: |
| 128 | + if hasattr(err, "message"): |
| 129 | + err = err.message |
| 130 | + logger.error( |
| 131 | + "Unable to get UserData for launch template %s with version %s.\nException: %s", lt_id, lt_version, err |
| 132 | + ) |
| 133 | + |
| 134 | + return decoded_data |
| 135 | + |
| 136 | + |
| 137 | +def get_write_directives_section(user_data): |
| 138 | + """Get write_files section from cloud-config section of MIME formatted UserData.""" |
| 139 | + write_directives_section = None |
| 140 | + try: |
| 141 | + data = message_from_string(user_data) |
| 142 | + for cloud_config_section in data.walk(): |
| 143 | + if cloud_config_section.get_content_type() == "text/cloud-config": |
| 144 | + write_directives_section = yaml.safe_load(cloud_config_section._payload).get("write_files") |
| 145 | + except Exception as err: |
| 146 | + logger.error("Error occurred while parsing write_files section.\nException: %s", err) |
| 147 | + return write_directives_section |
| 148 | + |
| 149 | + |
| 150 | +def write_dna_files(write_files_section, shared_storage_loc): |
| 151 | + """ |
| 152 | + After extracting dna.json from write_files section of UserData, write it in shared location. |
| 153 | +
|
| 154 | + :param write_files_section: Entire write_files section from UserData |
| 155 | + :param shared_storage_loc: Shared Storage Location of where to write dna.json |
| 156 | + :return: None |
| 157 | + """ |
| 158 | + try: |
| 159 | + file_path = shared_storage_loc + "-dna.json" |
| 160 | + for data in write_files_section: |
| 161 | + if data["path"] in ["/tmp/dna.json"]: # nosec B108 |
| 162 | + with open(file_path, "w", encoding="utf-8") as file: |
| 163 | + file.write(json.dumps(json.loads(data["content"]), indent=4)) |
| 164 | + except Exception as err: |
| 165 | + if hasattr(err, "message"): |
| 166 | + err = err.message |
| 167 | + logger.error("Unable to write %s due to %s", file_path, err) |
| 168 | + |
| 169 | + |
| 170 | +def get_latest_dna_data(resource, output_location, args): |
| 171 | + """ |
| 172 | + Get latest User Data, extract relevant details and write dna.json. |
| 173 | +
|
| 174 | + :param resource: Resource containing LT ID, Version and Logical id |
| 175 | + :param output_location: Shared Storage Location were we want to write dna.json |
| 176 | + :param args: Command Line arguments |
| 177 | + :rtype: None |
| 178 | + """ |
| 179 | + user_data = get_user_data( |
| 180 | + resource.get("LaunchTemplate").get("Id"), resource.get("LaunchTemplate").get("Version"), args.region |
| 181 | + ) |
| 182 | + if user_data: |
| 183 | + write_directives = get_write_directives_section(user_data) |
| 184 | + write_dna_files(write_directives, output_location + resource.get("LaunchTemplate").get("LogicalId")) |
| 185 | + |
| 186 | + |
| 187 | +def cleanup(directory_loc): |
| 188 | + """Cleanup dna.json and extra.json files.""" |
| 189 | + for f in os.listdir(directory_loc): |
| 190 | + f_path = os.path.join(directory_loc, f) |
| 191 | + try: |
| 192 | + if os.path.isfile(f_path): |
| 193 | + os.remove(f_path) |
| 194 | + except Exception as err: |
| 195 | + logger.warning("Unable to delete %s due to %s", f_path, err) |
| 196 | + |
| 197 | + |
| 198 | +def _parse_cli_args(): |
| 199 | + """Parse command line args.""" |
| 200 | + parser = argparse.ArgumentParser( |
| 201 | + description="Get latest User Data from ComputeFleet Launch Templates.", exit_on_error=False |
| 202 | + ) |
| 203 | + |
| 204 | + parser.add_argument( |
| 205 | + "-r", |
| 206 | + "--region", |
| 207 | + required=False, |
| 208 | + type=str, |
| 209 | + default=os.getenv("AWS_REGION", None), |
| 210 | + help="the cluster AWS region, defaults to AWS_REGION env variable", |
| 211 | + ) |
| 212 | + |
| 213 | + parser.add_argument( |
| 214 | + "-c", |
| 215 | + "--cleanup", |
| 216 | + action="store_true", |
| 217 | + required=False, |
| 218 | + help="Cleanup DNA files created", |
| 219 | + ) |
| 220 | + |
| 221 | + args = parser.parse_args() |
| 222 | + |
| 223 | + return args |
| 224 | + |
| 225 | + |
| 226 | +def main(): |
| 227 | + try: |
| 228 | + args = _parse_cli_args() |
| 229 | + if args.cleanup: |
| 230 | + cleanup(COMPUTE_FLEET_SHARED_DNA_LOCATION) |
| 231 | + else: |
| 232 | + share_compute_fleet_dna(args) |
| 233 | + except Exception as err: |
| 234 | + if hasattr(err, "message"): |
| 235 | + err = err.message |
| 236 | + logger.exception( |
| 237 | + "Encountered exception when fetching latest dna.json for ComputeFleet, exiting gracefully: %s", err |
| 238 | + ) |
| 239 | + raise SystemExit(0) |
| 240 | + |
| 241 | + |
| 242 | +if __name__ == "__main__": |
| 243 | + main() |
0 commit comments