Skip to content

Commit 10a5697

Browse files
author
Himani Anil Deshpande
committed
[CN] Remove usage of cfn-init in compute node bootstrapping
* Use cloud-init's native write_files mechanism to create /tmp/dna.json and /tmp/extra.json
1 parent e9166c1 commit 10a5697

File tree

2 files changed: +105 -153 lines changed

cli/src/pcluster/resources/compute_node/user_data.sh

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,16 @@ datasource_list: [ Ec2, None ]
5151
output:
5252
all: "| tee -a /var/log/cloud-init-output.log | logger -t user-data -s 2>/dev/ttyS0"
5353
write_files:
54+
- path: /tmp/dna.json
55+
permissions: '0644'
56+
owner: root:root
57+
content: |
58+
${DnaJson}
59+
- path: /tmp/extra.json
60+
permissions: '0644'
61+
owner: root:root
62+
content: |
63+
${ExtraJson}
5464
- path: /tmp/bootstrap.sh
5565
permissions: '0744'
5666
owner: root:root
@@ -99,8 +109,6 @@ write_files:
99109

100110
[ -f /etc/profile.d/proxy.sh ] && . /etc/profile.d/proxy.sh
101111

102-
$CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -s ${AWS::StackName} -v -c deployFiles -r ${LaunchTemplateResourceId} --region ${AWS::Region} --url ${CloudFormationUrl} --role ${CfnInitRole} || error_exit 'Failed to bootstrap the compute node. Please check /var/log/cfn-init.log in the compute node or in CloudWatch logs. Please refer to https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for more details on ParallelCluster logs.'
103-
104112
[ -f /etc/profile.d/aws-cli-default-config.sh ] && . /etc/profile.d/aws-cli-default-config.sh
105113

106114
custom_cookbook=${CustomChefCookbook}
@@ -139,9 +147,11 @@ write_files:
139147
vendor_cookbook
140148
fi
141149
cd /tmp
150+
mkdir -p /etc/chef/ohai/hints
151+
touch /etc/chef/ohai/hints/ec2.json
142152

143153
start=$(date +%s)
144-
154+
jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )
145155
{
146156
CINC_CMD="cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist"
147157
FR_CMD="/opt/parallelcluster/scripts/fetch_and_run"

cli/src/pcluster/templates/queues_stack.py

Lines changed: 92 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -199,100 +199,6 @@ def _add_compute_resource_launch_template(
199199
instance_role_name = self.managed_compute_instance_roles[queue.name].ref
200200

201201
launch_template_id = f"LaunchTemplate{create_hash_suffix(queue.name + compute_resource.name)}"
202-
launch_template = ec2.CfnLaunchTemplate(
203-
self,
204-
launch_template_id,
205-
launch_template_name=f"{self.stack_name}-{queue.name}-{compute_resource.name}",
206-
launch_template_data=ec2.CfnLaunchTemplate.LaunchTemplateDataProperty(
207-
block_device_mappings=self._launch_template_builder.get_block_device_mappings(
208-
queue.compute_settings.local_storage.root_volume,
209-
AWSApi.instance().ec2.describe_image(self._config.image_dict[queue.name]).device_name,
210-
),
211-
# key_name=,
212-
network_interfaces=compute_lt_nw_interfaces,
213-
placement=ec2.CfnLaunchTemplate.PlacementProperty(group_name=placement_group),
214-
image_id=self._config.image_dict[queue.name],
215-
iam_instance_profile=ec2.CfnLaunchTemplate.IamInstanceProfileProperty(
216-
name=instance_profiles[queue.name]
217-
),
218-
instance_market_options=self._launch_template_builder.get_instance_market_options(
219-
queue, compute_resource
220-
),
221-
instance_initiated_shutdown_behavior="terminate",
222-
capacity_reservation_specification=self._launch_template_builder.get_capacity_reservation(
223-
queue,
224-
compute_resource,
225-
),
226-
metadata_options=ec2.CfnLaunchTemplate.MetadataOptionsProperty(
227-
http_tokens=get_http_tokens_setting(self._config.imds.imds_support)
228-
),
229-
user_data=Fn.base64(
230-
Fn.sub(
231-
get_user_data_content("../resources/compute_node/user_data.sh"),
232-
{
233-
**{
234-
# Disable multithreading using logic from
235-
# https://aws.amazon.com/blogs/compute/disabling-intel-hyper-threading-technology-on-amazon-linux/
236-
# thread_siblings_list contains a comma (,) or dash (-) separated list of CPU hardware
237-
# threads within the same core as cpu
238-
# e.g. 0-1 or 0,1
239-
# cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list
240-
# | tr '-' ',' # convert hyphen (-) to comma (,), to account that
241-
# # some kernels and CPU architectures use a hyphen
242-
# # instead of a comma
243-
# | cut -s -d, -f2- # split over comma (,) and take the right part
244-
# | tr ',' '\n' # convert remaining comma (,) into new lines
245-
# | sort -un # sort and unique
246-
"DisableMultiThreadingManually": (
247-
"true" if compute_resource.disable_simultaneous_multithreading_manually else "false"
248-
),
249-
"BaseOS": self._config.image.os,
250-
"OSUser": OS_MAPPING[self._config.image.os]["user"],
251-
"ClusterName": self.stack_name,
252-
"Timeout": str(
253-
get_attr(
254-
self._config,
255-
"dev_settings.timeouts.compute_node_bootstrap_timeout",
256-
NODE_BOOTSTRAP_TIMEOUT,
257-
)
258-
),
259-
"ComputeStartupTimeMetricEnabled": str(
260-
get_attr(
261-
self._config,
262-
"dev_settings.compute_startup_time_metric_enabled",
263-
default=False,
264-
)
265-
),
266-
"LaunchTemplateResourceId": launch_template_id,
267-
"CloudFormationUrl": get_service_endpoint("cloudformation", self._config.region),
268-
"CfnInitRole": instance_role_name,
269-
},
270-
**get_common_user_data_env(queue, self._config),
271-
},
272-
)
273-
),
274-
monitoring=ec2.CfnLaunchTemplate.MonitoringProperty(enabled=is_detailed_monitoring_enabled),
275-
tag_specifications=[
276-
ec2.CfnLaunchTemplate.TagSpecificationProperty(
277-
resource_type="instance",
278-
tags=get_default_instance_tags(
279-
self.stack_name, self._config, compute_resource, "Compute", self._shared_storage_infos
280-
)
281-
+ [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)]
282-
+ [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)]
283-
+ self._get_custom_compute_resource_tags(queue, compute_resource),
284-
),
285-
ec2.CfnLaunchTemplate.TagSpecificationProperty(
286-
resource_type="volume",
287-
tags=get_default_volume_tags(self.stack_name, "Compute")
288-
+ [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)]
289-
+ [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)]
290-
+ self._get_custom_compute_resource_tags(queue, compute_resource),
291-
),
292-
],
293-
**conditional_template_properties,
294-
),
295-
)
296202

297203
dna_json = json.dumps(
298204
{
@@ -397,64 +303,100 @@ def _add_compute_resource_launch_template(
397303
"launch_template_id": launch_template_id,
398304
}
399305
},
400-
indent=4,
306+
indent=None, # Keep indent as None for compact sizing and proper parsing in user_data.sh
401307
)
402308

403-
cfn_init = {
404-
"configSets": {
405-
"deployFiles": ["deployConfigFiles"],
406-
"update": ["deployConfigFiles", "chefUpdate"],
407-
},
408-
"deployConfigFiles": {
409-
"files": {
410-
# A nosec comment is appended to the following line in order to disable the B108 check.
411-
# The file is needed by the product
412-
# [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory.
413-
"/tmp/dna.json": { # nosec B108
414-
"content": dna_json,
415-
"mode": "000644",
416-
"owner": "root",
417-
"group": "root",
418-
"encoding": "plain",
419-
},
420-
# A nosec comment is appended to the following line in order to disable the B108 check.
421-
# The file is needed by the product
422-
# [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory.
423-
"/tmp/extra.json": { # nosec B108
424-
"mode": "000644",
425-
"owner": "root",
426-
"group": "root",
427-
"content": self._config.extra_chef_attributes,
428-
},
429-
},
430-
"commands": {
431-
"mkdir": {"command": "mkdir -p /etc/chef/ohai/hints"},
432-
"touch": {"command": "touch /etc/chef/ohai/hints/ec2.json"},
433-
"jq": {
434-
"command": (
435-
'jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json '
436-
'|| ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )'
309+
launch_template = ec2.CfnLaunchTemplate(
310+
self,
311+
launch_template_id,
312+
launch_template_name=f"{self.stack_name}-{queue.name}-{compute_resource.name}",
313+
launch_template_data=ec2.CfnLaunchTemplate.LaunchTemplateDataProperty(
314+
block_device_mappings=self._launch_template_builder.get_block_device_mappings(
315+
queue.compute_settings.local_storage.root_volume,
316+
AWSApi.instance().ec2.describe_image(self._config.image_dict[queue.name]).device_name,
317+
),
318+
# key_name=,
319+
network_interfaces=compute_lt_nw_interfaces,
320+
placement=ec2.CfnLaunchTemplate.PlacementProperty(group_name=placement_group),
321+
image_id=self._config.image_dict[queue.name],
322+
iam_instance_profile=ec2.CfnLaunchTemplate.IamInstanceProfileProperty(
323+
name=instance_profiles[queue.name]
324+
),
325+
instance_market_options=self._launch_template_builder.get_instance_market_options(
326+
queue, compute_resource
327+
),
328+
instance_initiated_shutdown_behavior="terminate",
329+
capacity_reservation_specification=self._launch_template_builder.get_capacity_reservation(
330+
queue,
331+
compute_resource,
332+
),
333+
metadata_options=ec2.CfnLaunchTemplate.MetadataOptionsProperty(
334+
http_tokens=get_http_tokens_setting(self._config.imds.imds_support)
335+
),
336+
user_data=Fn.base64(
337+
Fn.sub(
338+
get_user_data_content("../resources/compute_node/user_data.sh"),
339+
{
340+
**{
341+
# Disable multithreading using logic from
342+
# https://aws.amazon.com/blogs/compute/disabling-intel-hyper-threading-technology-on-amazon-linux/
343+
# thread_siblings_list contains a comma (,) or dash (-) separated list of CPU hardware
344+
# threads within the same core as cpu
345+
# e.g. 0-1 or 0,1
346+
# cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list
347+
# | tr '-' ',' # convert hyphen (-) to comma (,), to account that
348+
# # some kernels and CPU architectures use a hyphen
349+
# # instead of a comma
350+
# | cut -s -d, -f2- # split over comma (,) and take the right part
351+
# | tr ',' '\n' # convert remaining comma (,) into new lines
352+
# | sort -un # sort and unique
353+
"DisableMultiThreadingManually": (
354+
"true" if compute_resource.disable_simultaneous_multithreading_manually else "false"
355+
),
356+
"BaseOS": self._config.image.os,
357+
"ClusterName": self.stack_name,
358+
"Timeout": str(
359+
get_attr(
360+
self._config,
361+
"dev_settings.timeouts.compute_node_bootstrap_timeout",
362+
NODE_BOOTSTRAP_TIMEOUT,
363+
)
364+
),
365+
"ComputeStartupTimeMetricEnabled": str(
366+
get_attr(
367+
self._config,
368+
"dev_settings.compute_startup_time_metric_enabled",
369+
default=False,
370+
)
371+
),
372+
"DnaJson": dna_json,
373+
"ExtraJson": self._config.extra_chef_attributes,
374+
},
375+
**get_common_user_data_env(queue, self._config),
376+
},
377+
)
378+
),
379+
monitoring=ec2.CfnLaunchTemplate.MonitoringProperty(enabled=is_detailed_monitoring_enabled),
380+
tag_specifications=[
381+
ec2.CfnLaunchTemplate.TagSpecificationProperty(
382+
resource_type="instance",
383+
tags=get_default_instance_tags(
384+
self.stack_name, self._config, compute_resource, "Compute", self._shared_storage_infos
437385
)
438-
},
439-
},
440-
},
441-
"chefUpdate": {
442-
"commands": {
443-
"chef": {
444-
"command": (
445-
". /etc/parallelcluster/pcluster_cookbook_environment.sh; "
446-
"cinc-client --local-mode --config /etc/chef/client.rb --log_level info"
447-
" --logfile /var/log/chef-client.log --force-formatter --no-color"
448-
" --chef-zero-port 8889 --json-attributes /etc/chef/dna.json"
449-
" --override-runlist aws-parallelcluster-entrypoints::update &&"
450-
" /opt/parallelcluster/scripts/fetch_and_run -postupdate"
451-
),
452-
"cwd": "/etc/chef",
453-
}
454-
}
455-
},
456-
}
457-
458-
launch_template.add_metadata("AWS::CloudFormation::Init", cfn_init)
386+
+ [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)]
387+
+ [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)]
388+
+ self._get_custom_compute_resource_tags(queue, compute_resource),
389+
),
390+
ec2.CfnLaunchTemplate.TagSpecificationProperty(
391+
resource_type="volume",
392+
tags=get_default_volume_tags(self.stack_name, "Compute")
393+
+ [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)]
394+
+ [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)]
395+
+ self._get_custom_compute_resource_tags(queue, compute_resource),
396+
),
397+
],
398+
**conditional_template_properties,
399+
),
400+
)
459401

460402
return launch_template

0 commit comments

Comments
 (0)