Skip to content

Commit ad81a3d

Browse files
himani2411 authored and Himani Anil Deshpande committed
[Scaling] Removing usage of cfn-init for compute fleet (aws#6655)
* [CN] Removing usage of cfn-init in compute Node bootstrapping
* Using cloud-init native approach for creating files
* Giving HN DescribeLaunchTemplateVersions Permissions
* Adding CFN LogicalId for Compute Launch Template
* [Unit Test] Update user_data variables for head Node
* Change dna.json and extra.json test for compute fleet
* Code-linting changes
* Removing unwanted comments
* Removing unused variables and imports

---------

Co-authored-by: Himani Anil Deshpande <[email protected]>
1 parent deb8e97 commit ad81a3d

File tree

6 files changed

+124
-186
lines changed

6 files changed

+124
-186
lines changed

cli/src/pcluster/resources/compute_node/user_data.sh

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,16 @@ datasource_list: [ Ec2, None ]
5151
output:
5252
all: "| tee -a /var/log/cloud-init-output.log | logger -t user-data -s 2>/dev/ttyS0"
5353
write_files:
54+
- path: /tmp/dna.json
55+
permissions: '0644'
56+
owner: root:root
57+
content: |
58+
${DnaJson}
59+
- path: /tmp/extra.json
60+
permissions: '0644'
61+
owner: root:root
62+
content: |
63+
${ExtraJson}
5464
- path: /tmp/bootstrap.sh
5565
permissions: '0744'
5666
owner: root:root
@@ -99,8 +109,6 @@ write_files:
99109

100110
[ -f /etc/profile.d/proxy.sh ] && . /etc/profile.d/proxy.sh
101111

102-
$CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -s ${AWS::StackName} -v -c deployFiles -r ${LaunchTemplateResourceId} --region ${AWS::Region} --url ${CloudFormationUrl} --role ${CfnInitRole} || error_exit 'Failed to bootstrap the compute node. Please check /var/log/cfn-init.log in the compute node or in CloudWatch logs. Please refer to https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for more details on ParallelCluster logs.'
103-
104112
[ -f /etc/profile.d/aws-cli-default-config.sh ] && . /etc/profile.d/aws-cli-default-config.sh
105113

106114
custom_cookbook=${CustomChefCookbook}
@@ -139,9 +147,11 @@ write_files:
139147
vendor_cookbook
140148
fi
141149
cd /tmp
150+
mkdir -p /etc/chef/ohai/hints
151+
touch /etc/chef/ohai/hints/ec2.json
142152

143153
start=$(date +%s)
144-
154+
jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )
145155
{
146156
CINC_CMD="cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist"
147157
FR_CMD="/opt/parallelcluster/scripts/fetch_and_run"

cli/src/pcluster/templates/cdk_builder_utils.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -605,6 +605,7 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
605605
iam.PolicyStatement(
606606
sid="Ec2",
607607
actions=[
608+
"ec2:DescribeLaunchTemplateVersions",
608609
"ec2:DescribeInstanceAttribute",
609610
"ec2:DescribeInstances",
610611
"ec2:DescribeInstanceStatus",

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1523,7 +1523,11 @@ def _get_launch_templates_config(self):
15231523
lt_config["Queues"][queue] = {"ComputeResources": {}}
15241524
for compute_resource, launch_template in compute_resources.items():
15251525
lt_config["Queues"][queue]["ComputeResources"][compute_resource] = {
1526-
"LaunchTemplate": {"Id": launch_template.ref, "Version": launch_template.attr_latest_version_number}
1526+
"LaunchTemplate": {
1527+
"Id": launch_template.ref,
1528+
"Version": launch_template.attr_latest_version_number,
1529+
"LogicalId": launch_template.logical_id,
1530+
}
15271531
}
15281532

15291533
return lt_config

cli/src/pcluster/templates/queues_stack.py

Lines changed: 92 additions & 165 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@
3333
to_comma_separated_string,
3434
)
3535
from pcluster.templates.slurm_builder import SlurmConstruct
36-
from pcluster.utils import get_attr, get_http_tokens_setting, get_resource_name_from_resource_arn, get_service_endpoint
36+
from pcluster.utils import get_attr, get_http_tokens_setting
3737

3838

3939
class QueuesStack(NestedStack):
@@ -184,115 +184,7 @@ def _add_compute_resource_launch_template(
184184
if isinstance(compute_resource, SlurmComputeResource):
185185
conditional_template_properties.update({"instance_type": compute_resource.instance_types[0]})
186186

187-
if queue.instance_profile:
188-
instance_profile_name = get_resource_name_from_resource_arn(queue.instance_profile)
189-
instance_role_name = (
190-
AWSApi.instance()
191-
.iam.get_instance_profile(instance_profile_name)
192-
.get("InstanceProfile")
193-
.get("Roles")[0]
194-
.get("RoleName")
195-
)
196-
elif queue.instance_role:
197-
instance_role_name = get_resource_name_from_resource_arn(queue.instance_role)
198-
else:
199-
instance_role_name = self.managed_compute_instance_roles[queue.name].ref
200-
201187
launch_template_id = f"LaunchTemplate{create_hash_suffix(queue.name + compute_resource.name)}"
202-
launch_template = ec2.CfnLaunchTemplate(
203-
self,
204-
launch_template_id,
205-
launch_template_name=f"{self.stack_name}-{queue.name}-{compute_resource.name}",
206-
launch_template_data=ec2.CfnLaunchTemplate.LaunchTemplateDataProperty(
207-
block_device_mappings=self._launch_template_builder.get_block_device_mappings(
208-
queue.compute_settings.local_storage.root_volume,
209-
AWSApi.instance().ec2.describe_image(self._config.image_dict[queue.name]).device_name,
210-
),
211-
# key_name=,
212-
network_interfaces=compute_lt_nw_interfaces,
213-
placement=ec2.CfnLaunchTemplate.PlacementProperty(group_name=placement_group),
214-
image_id=self._config.image_dict[queue.name],
215-
iam_instance_profile=ec2.CfnLaunchTemplate.IamInstanceProfileProperty(
216-
name=instance_profiles[queue.name]
217-
),
218-
instance_market_options=self._launch_template_builder.get_instance_market_options(
219-
queue, compute_resource
220-
),
221-
instance_initiated_shutdown_behavior="terminate",
222-
capacity_reservation_specification=self._launch_template_builder.get_capacity_reservation(
223-
queue,
224-
compute_resource,
225-
),
226-
metadata_options=ec2.CfnLaunchTemplate.MetadataOptionsProperty(
227-
http_tokens=get_http_tokens_setting(self._config.imds.imds_support)
228-
),
229-
user_data=Fn.base64(
230-
Fn.sub(
231-
get_user_data_content("../resources/compute_node/user_data.sh"),
232-
{
233-
**{
234-
# Disable multithreading using logic from
235-
# https://aws.amazon.com/blogs/compute/disabling-intel-hyper-threading-technology-on-amazon-linux/
236-
# thread_siblings_list contains a comma (,) or dash (-) separated list of CPU hardware
237-
# threads within the same core as cpu
238-
# e.g. 0-1 or 0,1
239-
# cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list
240-
# | tr '-' ',' # convert hyphen (-) to comma (,), to account that
241-
# # some kernels and CPU architectures use a hyphen
242-
# # instead of a comma
243-
# | cut -s -d, -f2- # split over comma (,) and take the right part
244-
# | tr ',' '\n' # convert remaining comma (,) into new lines
245-
# | sort -un # sort and unique
246-
"DisableMultiThreadingManually": (
247-
"true" if compute_resource.disable_simultaneous_multithreading_manually else "false"
248-
),
249-
"BaseOS": self._config.image.os,
250-
"OSUser": OS_MAPPING[self._config.image.os]["user"],
251-
"ClusterName": self.stack_name,
252-
"Timeout": str(
253-
get_attr(
254-
self._config,
255-
"dev_settings.timeouts.compute_node_bootstrap_timeout",
256-
NODE_BOOTSTRAP_TIMEOUT,
257-
)
258-
),
259-
"ComputeStartupTimeMetricEnabled": str(
260-
get_attr(
261-
self._config,
262-
"dev_settings.compute_startup_time_metric_enabled",
263-
default=False,
264-
)
265-
),
266-
"LaunchTemplateResourceId": launch_template_id,
267-
"CloudFormationUrl": get_service_endpoint("cloudformation", self._config.region),
268-
"CfnInitRole": instance_role_name,
269-
},
270-
**get_common_user_data_env(queue, self._config),
271-
},
272-
)
273-
),
274-
monitoring=ec2.CfnLaunchTemplate.MonitoringProperty(enabled=is_detailed_monitoring_enabled),
275-
tag_specifications=[
276-
ec2.CfnLaunchTemplate.TagSpecificationProperty(
277-
resource_type="instance",
278-
tags=get_default_instance_tags(
279-
self.stack_name, self._config, compute_resource, "Compute", self._shared_storage_infos
280-
)
281-
+ [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)]
282-
+ [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)]
283-
+ self._get_custom_compute_resource_tags(queue, compute_resource),
284-
),
285-
ec2.CfnLaunchTemplate.TagSpecificationProperty(
286-
resource_type="volume",
287-
tags=get_default_volume_tags(self.stack_name, "Compute")
288-
+ [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)]
289-
+ [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)]
290-
+ self._get_custom_compute_resource_tags(queue, compute_resource),
291-
),
292-
],
293-
**conditional_template_properties,
294-
),
295-
)
296188

297189
dna_json = json.dumps(
298190
{
@@ -397,64 +289,99 @@ def _add_compute_resource_launch_template(
397289
"launch_template_id": launch_template_id,
398290
}
399291
},
400-
indent=4,
292+
indent=None, # Keep indent as None for compact sizing and proper parsing in user_data.sh
401293
)
402294

403-
cfn_init = {
404-
"configSets": {
405-
"deployFiles": ["deployConfigFiles"],
406-
"update": ["deployConfigFiles", "chefUpdate"],
407-
},
408-
"deployConfigFiles": {
409-
"files": {
410-
# A nosec comment is appended to the following line in order to disable the B108 check.
411-
# The file is needed by the product
412-
# [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory.
413-
"/tmp/dna.json": { # nosec B108
414-
"content": dna_json,
415-
"mode": "000644",
416-
"owner": "root",
417-
"group": "root",
418-
"encoding": "plain",
419-
},
420-
# A nosec comment is appended to the following line in order to disable the B108 check.
421-
# The file is needed by the product
422-
# [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory.
423-
"/tmp/extra.json": { # nosec B108
424-
"mode": "000644",
425-
"owner": "root",
426-
"group": "root",
427-
"content": self._config.extra_chef_attributes,
428-
},
429-
},
430-
"commands": {
431-
"mkdir": {"command": "mkdir -p /etc/chef/ohai/hints"},
432-
"touch": {"command": "touch /etc/chef/ohai/hints/ec2.json"},
433-
"jq": {
434-
"command": (
435-
'jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json '
436-
'|| ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )'
295+
launch_template = ec2.CfnLaunchTemplate(
296+
self,
297+
launch_template_id,
298+
launch_template_name=f"{self.stack_name}-{queue.name}-{compute_resource.name}",
299+
launch_template_data=ec2.CfnLaunchTemplate.LaunchTemplateDataProperty(
300+
block_device_mappings=self._launch_template_builder.get_block_device_mappings(
301+
queue.compute_settings.local_storage.root_volume,
302+
AWSApi.instance().ec2.describe_image(self._config.image_dict[queue.name]).device_name,
303+
),
304+
network_interfaces=compute_lt_nw_interfaces,
305+
placement=ec2.CfnLaunchTemplate.PlacementProperty(group_name=placement_group),
306+
image_id=self._config.image_dict[queue.name],
307+
iam_instance_profile=ec2.CfnLaunchTemplate.IamInstanceProfileProperty(
308+
name=instance_profiles[queue.name]
309+
),
310+
instance_market_options=self._launch_template_builder.get_instance_market_options(
311+
queue, compute_resource
312+
),
313+
instance_initiated_shutdown_behavior="terminate",
314+
capacity_reservation_specification=self._launch_template_builder.get_capacity_reservation(
315+
queue,
316+
compute_resource,
317+
),
318+
metadata_options=ec2.CfnLaunchTemplate.MetadataOptionsProperty(
319+
http_tokens=get_http_tokens_setting(self._config.imds.imds_support)
320+
),
321+
user_data=Fn.base64(
322+
Fn.sub(
323+
get_user_data_content("../resources/compute_node/user_data.sh"),
324+
{
325+
**{
326+
# Disable multithreading using logic from
327+
# https://aws.amazon.com/blogs/compute/disabling-intel-hyper-threading-technology-on-amazon-linux/
328+
# thread_siblings_list contains a comma (,) or dash (-) separated list of CPU hardware
329+
# threads within the same core as cpu
330+
# e.g. 0-1 or 0,1
331+
# cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list
332+
# | tr '-' ',' # convert hyphen (-) to comma (,), to account that
333+
# # some kernels and CPU architectures use a hyphen
334+
# # instead of a comma
335+
# | cut -s -d, -f2- # split over comma (,) and take the right part
336+
# | tr ',' '\n' # convert remaining comma (,) into new lines
337+
# | sort -un # sort and unique
338+
"DisableMultiThreadingManually": (
339+
"true" if compute_resource.disable_simultaneous_multithreading_manually else "false"
340+
),
341+
"BaseOS": self._config.image.os,
342+
"ClusterName": self.stack_name,
343+
"Timeout": str(
344+
get_attr(
345+
self._config,
346+
"dev_settings.timeouts.compute_node_bootstrap_timeout",
347+
NODE_BOOTSTRAP_TIMEOUT,
348+
)
349+
),
350+
"ComputeStartupTimeMetricEnabled": str(
351+
get_attr(
352+
self._config,
353+
"dev_settings.compute_startup_time_metric_enabled",
354+
default=False,
355+
)
356+
),
357+
"DnaJson": dna_json,
358+
"ExtraJson": self._config.extra_chef_attributes,
359+
},
360+
**get_common_user_data_env(queue, self._config),
361+
},
362+
)
363+
),
364+
monitoring=ec2.CfnLaunchTemplate.MonitoringProperty(enabled=is_detailed_monitoring_enabled),
365+
tag_specifications=[
366+
ec2.CfnLaunchTemplate.TagSpecificationProperty(
367+
resource_type="instance",
368+
tags=get_default_instance_tags(
369+
self.stack_name, self._config, compute_resource, "Compute", self._shared_storage_infos
437370
)
438-
},
439-
},
440-
},
441-
"chefUpdate": {
442-
"commands": {
443-
"chef": {
444-
"command": (
445-
". /etc/parallelcluster/pcluster_cookbook_environment.sh; "
446-
"cinc-client --local-mode --config /etc/chef/client.rb --log_level info"
447-
" --logfile /var/log/chef-client.log --force-formatter --no-color"
448-
" --chef-zero-port 8889 --json-attributes /etc/chef/dna.json"
449-
" --override-runlist aws-parallelcluster-entrypoints::update &&"
450-
" /opt/parallelcluster/scripts/fetch_and_run -postupdate"
451-
),
452-
"cwd": "/etc/chef",
453-
}
454-
}
455-
},
456-
}
457-
458-
launch_template.add_metadata("AWS::CloudFormation::Init", cfn_init)
371+
+ [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)]
372+
+ [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)]
373+
+ self._get_custom_compute_resource_tags(queue, compute_resource),
374+
),
375+
ec2.CfnLaunchTemplate.TagSpecificationProperty(
376+
resource_type="volume",
377+
tags=get_default_volume_tags(self.stack_name, "Compute")
378+
+ [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)]
379+
+ [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)]
380+
+ self._get_custom_compute_resource_tags(queue, compute_resource),
381+
),
382+
],
383+
**conditional_template_properties,
384+
),
385+
)
459386

460387
return launch_template

cli/tests/pcluster/templates/test_cluster_stack.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -416,8 +416,8 @@ def test_compute_launch_template_properties(
416416
"UserData"
417417
]["Fn::Base64"]["Fn::Sub"][1]
418418
expected_user_data_variables = {
419-
"CloudFormationUrl": "https://cloudformation.us-east-1.amazonaws.com",
420-
"LaunchTemplateResourceId": launch_template_logical_id,
419+
"DisableMultiThreadingManually": "true",
420+
"ProxyServer": "NONE",
421421
}
422422
for k, v in expected_user_data_variables.items():
423423
assert_that(user_data_variables[k]).is_equal_to(v)

0 commit comments

Comments
 (0)