From e06c732f2479621035f87e2de502db225a569baf Mon Sep 17 00:00:00 2001 From: Hanwen Date: Mon, 10 Feb 2025 13:48:58 -0800 Subject: [PATCH] Add instance id and instance type to slurmd The information will be shown by `scontrol show nodes`: ``` NodeName=queue-on-demand-dy-compute-resource-2-2 Arch=x86_64 CoresPerSocket=1 CPUAlloc=4 CPUEfctv=4 CPUTot=4 CPULoad=0.95 AvailableFeatures=dynamic,c5.xlarge,compute-resource-2 ActiveFeatures=dynamic,c5.xlarge,compute-resource-2 Gres=(null) NodeAddr=192.168.127.110 NodeHostName=queue-on-demand-dy-compute-resource-2-2 Version=24.05.2 OS=Linux 5.10.233-224.894.amzn2.x86_64 #1 SMP Mon Jan 27 16:52:48 UTC 2025 RealMemory=7782 AllocMem=0 FreeMem=6431 Sockets=4 Boards=1 State=ALLOCATED+CLOUD ThreadsPerCore=1 TmpDisk=0 Weight=1000 Owner=N/A MCS_label=N/A Partitions=queue-on-demand BootTime=2025-02-10T21:22:00 SlurmdStartTime=2025-02-10T21:25:05 LastBusyTime=2025-02-10T21:25:05 ResumeAfterTime=None CfgTRES=cpu=4,mem=7782M,billing=4 AllocTRES=cpu=4 CurrentWatts=0 AveWatts=0 InstanceId=i-0eb8d995282xxxx11 InstanceType=c5.xlarge ``` reference: https://slurm.schedmd.com/slurmd.html Signed-off-by: Hanwen --- CHANGELOG.md | 1 + .../spec/unit/recipes/config_slurmd_systemd_service_spec.rb | 6 +++++- .../templates/default/slurm/compute/slurmd.service.erb | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 78f234f2b6..01b61fd75e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Upgrade Pmix to 5.0.6 (from 5.0.3). - Upgrade ARM PL to version 24.10 (from 23.10). - Remove generation of DSA keys for login nodes as DSA, which became unsupported in OpenSSH 9.7+. +- Set instance ID and instance type information in Slurm upon compute node launch. 
3.12.0 ------ diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurmd_systemd_service_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurmd_systemd_service_spec.rb index a81bcce6d1..a2f034eba8 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurmd_systemd_service_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurmd_systemd_service_spec.rb @@ -21,7 +21,11 @@ for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do - runner(platform: platform, version: version).converge(described_recipe) + runner = runner(platform: platform, version: version) do |node| + node.override['ec2']['instance_id'] = "i-xxx" + node.override['ec2']['instance_type'] = "fake-instance-type" + end + runner.converge(described_recipe) end it 'creates the service definition for slurmd' do diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd.service.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd.service.erb index da6231a0b7..12ac5c854d 100644 --- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd.service.erb +++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd.service.erb @@ -7,7 +7,7 @@ ConditionPathExists=<%= node['cluster']['slurm']['install_dir'] %>/etc/slurm.con [Service] Type=simple EnvironmentFile=-/etc/sysconfig/slurmd -ExecStart=<%= node['cluster']['slurm']['install_dir'] %>/sbin/slurmd -D -s $SLURMD_OPTIONS +ExecStart=<%= node['cluster']['slurm']['install_dir'] %>/sbin/slurmd -D -s $SLURMD_OPTIONS --instance-id <%= node['ec2']['instance_id'] %> --instance-type <%= node['ec2']['instance_type'] %> ExecReload=/bin/kill -HUP $MAINPID KillMode=process LimitNOFILE=131072