Skip to content

Commit dd83140

Browse files
author
Himani Anil Deshpande
committed
[NVIDIA_IMEX] Add resource to install Nvidia-imex
1 parent 59cce6b commit dd83140

File tree

9 files changed

+403
-0
lines changed

9 files changed

+403
-0
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Register this resource under the name :nvidia_imex for nodes whose platform is
# 'amazon' and whose platform_version, converted with to_i, equals 2023.
provides :nvidia_imex, platform: 'amazon' do |node|
16+
node['platform_version'].to_i == 2023
17+
end
18+
19+
# Load the shared resource body from the common partial.
use 'partial/_nvidia_imex_common.rb'
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Register this resource under the name :nvidia_imex for nodes whose platform is
# 'amazon' with platform_version '2'.
provides :nvidia_imex, platform: 'amazon', platform_version: '2'
16+
17+
# Load the shared resource body from the common partial.
use 'partial/_nvidia_imex_common.rb'
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Register this resource under the name :nvidia_imex for nodes whose platform is
# 'redhat' and whose platform_version, converted with to_i, is 8 or greater.
provides :nvidia_imex, platform: 'redhat' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
# Load the shared resource body from the common partial.
use 'partial/_nvidia_imex_common.rb'
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Register this resource under the name :nvidia_imex for nodes whose platform is
# 'rocky' and whose platform_version, converted with to_i, is 8 or greater.
provides :nvidia_imex, platform: 'rocky' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
# Load the shared resource body from the common partial.
use 'partial/_nvidia_imex_common.rb'
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Register this resource under the name :nvidia_imex for nodes whose platform is
# 'ubuntu' and whose platform_version, converted with to_i, is 22 or greater.
provides :nvidia_imex, platform: 'ubuntu' do |node|
16+
node['platform_version'].to_i >= 22
17+
end
18+
19+
# Load the shared resource body from the common partial.
use 'partial/_nvidia_imex_common.rb'
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# frozen_string_literal: true
2+
#
3+
# Copyright:: 2013-2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Turn on Chef unified mode for this resource.
unified_mode true
16+
# :install is the action used when the caller does not name one.
default_action :install
17+
18+
# Install action for NVIDIA IMEX.
#
# Does nothing unless nvidia_enabled_or_installed? is true, and does nothing when
# on_docker? is true or imex_installed already reports the binaries as present.
# Otherwise it adds the NVIDIA repository, renders the IMEX configuration files
# under "<shared_dir>/nvidia-imex", renders the systemd unit file, and installs
# the 'nvidia-imex' package.
action :install do
19+
return unless nvidia_enabled_or_installed?
20+
return if on_docker? || imex_installed
21+
22+
# Add NVIDIA repo for nvidia-imex
23+
nvidia_repo 'add nvidia repository' do
24+
action :add
25+
end
26+
27+
# Directory under the cluster shared dir that holds the two config files rendered below.
directory "#{node['cluster']['shared_dir']}/nvidia-imex"
28+
29+
# Main IMEX configuration file, rendered from the nvidia-imex-config.erb template.
# NOTE(review): mode 0755 marks a config file as executable — confirm this is intended.
template "#{node['cluster']['shared_dir']}/nvidia-imex/config.cfg" do
30+
source 'nvidia-imex/nvidia-imex-config.erb'
31+
owner 'root'
32+
group 'root'
33+
mode '0755'
34+
end
35+
36+
# Node list file, rendered from the nvidia-imex-nodes.erb template.
# NOTE(review): mode 0755 marks a config file as executable — confirm this is intended.
template "#{node['cluster']['shared_dir']}/nvidia-imex/nodes_config.cfg" do
37+
source 'nvidia-imex/nvidia-imex-nodes.erb'
38+
owner 'root'
39+
group 'root'
40+
mode '0755'
41+
end
42+
43+
# systemd unit file for the nvidia-imex service, rendered from nvidia-imex.service.erb.
template "/etc/systemd/system/nvidia-imex.service" do
44+
source 'nvidia-imex/nvidia-imex.service.erb'
45+
owner 'root'
46+
group 'root'
47+
mode '0644'
48+
action :create
49+
end
50+
51+
# Install the package at the version given by node['cluster']['nvidia']['driver_version'],
# retrying up to 3 times with a delay of 5 between attempts.
# NOTE(review): the files above are rendered before the package is installed — confirm
# the package does not overwrite or depend on them at install time.
package 'nvidia-imex' do
52+
retries 3
53+
retry_delay 5
54+
version node['cluster']['nvidia']['driver_version']
55+
end
56+
end
57+
58+
# Reports whether IMEX already appears to be installed.
#
# @return [Boolean] true when either /usr/bin/nvidia-imex or
#   /usr/bin/nvidia-imex-ctl exists on the filesystem.
def imex_installed
59+
::File.exist?('/usr/bin/nvidia-imex') || ::File.exist?('/usr/bin/nvidia-imex-ctl')
60+
end
61+
62+
# Combines the two NVIDIA checks used to gate the install action.
# nvidia_enabled? and nvidia_installed? are helpers defined outside this file.
#
# @return [Boolean] truthy when nvidia_enabled? or nvidia_installed? is truthy.
def nvidia_enabled_or_installed?
63+
nvidia_enabled? || nvidia_installed?
64+
end
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
# NVIDIA IMEX configuration file.
2+
# Note: This configuration file is read during IMEX startup. So, IMEX
3+
# service restart is required for new settings to take effect.
4+
5+
# Description: IMEX logging levels
6+
# Possible Values:
7+
# 0 - All the logging is disabled
8+
# 1 - Set log level to CRITICAL and above
9+
# 2 - Set log level to ERROR and above
10+
# 3 - Set log level to WARNING and above
11+
# 4 - Set log level to INFO and above
12+
# Default Value: 4
13+
LOG_LEVEL=4
14+
15+
# Description: Filename for IMEX logs
16+
# Possible Values:
17+
# Full path/filename string (max length of 256). Logs will be redirected
18+
# to console(stderr), if the specified log file can't be opened or the
19+
# path is empty.
20+
# Default Value: /var/log/nvidia-imex.log
21+
LOG_FILE_NAME=/var/log/nvidia-imex.log
22+
23+
# Description: Filename for IMEX stats logging
24+
# Possible Values:
25+
# Full path/filename string (max length of 256). Stats will be redirected
26+
# to console(stderr), if the specified stats file can't be opened or the
27+
# path is empty.
28+
# Default Value: /var/log/nvidia-imex-stats.log
29+
# Note: If STATS_FILE_NAME is configured same as LOG_FILE_NAME, then stats will
30+
# be redirected to the path/filename specified by LOG_FILE_NAME.
31+
STATS_FILE_NAME=/var/log/nvidia-imex-stats.log
32+
33+
# Description: Append to an existing log file or overwrite the logs
34+
# Possible Values:
35+
# 0 - No (Log file will be overwritten)
36+
# 1 - Yes (Append to existing log)
37+
# Default Value: 1
38+
LOG_APPEND_TO_LOG=1
39+
40+
# Description: Max size of log file (in MB)
41+
# Possible Values:
42+
# Any Integer values
43+
# Default Value: 1024
44+
LOG_FILE_MAX_SIZE=1024
45+
46+
# Description: Number of times the IMEX log is rotated once it reaches LOG_FILE_MAX_SIZE
47+
# Possible Values:
48+
# 0 - Log is not rotated. Logging is stopped once the IMEX log file reaches
49+
# the size specified in LOG_FILE_MAX_SIZE
50+
# Non-zero Integer - Log is rotated up to the number of times specified in LOG_MAX_ROTATE_COUNT,
51+
# after the size of the log file reaches the size specified in LOG_FILE_MAX_SIZE.
52+
# Combined IMEX log size is LOG_FILE_MAX_SIZE multiplied by LOG_MAX_ROTATE_COUNT+1
53+
# Once this threshold is reached, the oldest log file is purged and reused.
54+
LOG_MAX_ROTATE_COUNT=3
55+
56+
# Description: Redirect all the logs to syslog instead of logging to file
57+
# Possible Values:
58+
# 0 - No
59+
# 1 - Yes
60+
# Default Value: 0
61+
LOG_USE_SYSLOG=0
62+
63+
# Description: daemonize IMEX on start-up
64+
# Possible Values:
65+
# 0 - No (Do not daemonize and run IMEX as a normal process)
66+
# 1 - Yes (Run IMEX process as Unix daemon)
67+
# Default Value: 1
68+
DAEMONIZE=1
69+
70+
# Description: Network interface to listen for IMEX peer communication.
71+
# OPTIONAL - empty value will determine the bind IP from the node config file.
72+
# Possible Values:
73+
# A valid IPv4 address
74+
# A valid IPv6 address
75+
# No value - Determine bind IP from the node configuration file.
76+
# Default Value:
77+
BIND_INTERFACE_IP=
78+
79+
# Description: Starting TCP port number for IMEX peer communication
80+
# Possible Values:
81+
# Any value between 0 and 65535
82+
# Default Value: 50000
83+
SERVER_PORT=50000
84+
85+
# Description: Name of file containing IP addresses of nodes
86+
# Possible Values:
87+
# Full path/filename string (max length of 256).
88+
# Default Value: /etc/nvidia-imex/nodes_config.cfg
89+
# The cookbook renders the node list into the nvidia-imex subdirectory of the shared dir.
IMEX_NODE_CONFIG_FILE=<%= node['cluster']['shared_dir'] %>/nvidia-imex/nodes_config.cfg
90+
91+
# Description: Name of the network interface used for communication.
92+
# OPTIONAL - If empty, network interface will be determined by matching bind IP to
93+
# node configuration file. Only necessary to configure if the bind IP
94+
# is IPv6 link-local and on multiple network interfaces.
95+
# Possible Values:
96+
# Interface names like eth0, ens32 .. etc
97+
# Default Value:
98+
NETWORK_INTERFACE=
99+
100+
# Description: Controls whether IMEX should complete initialization without establishing quorum
101+
# Possible values:
102+
# NONE: Do not wait for any quorum with other nodes.
103+
# RECOVERY: In case of unsafe IMEX termination, wait until all nodes that had previously imported
104+
# have connected, allowing them time to safely clean up any potentially hanging references
105+
# Default value: RECOVERY
106+
IMEX_WAIT_FOR_QUORUM=RECOVERY
107+
108+
# Description: Enable authentication and encryption between nodes.
109+
# Possible Values:
110+
# 0: Disable encryption and authentication
111+
# 1: Enable encryption and authentication
112+
# Default value: 0
113+
IMEX_ENABLE_AUTH_ENCRYPTION=0
114+
115+
# Description: Controls the security mechanism used by IMEX for authentication and encryption between nodes.
116+
# If IMEX_ENABLE_AUTH_ENCRYPTION is enabled (1), then IMEX_AUTH_ENCRYPTION_MODE must be configured
117+
# as one of the supported values. An empty or unexpected value will prevent initialization.
118+
# Possible Values:
119+
# SSL_TLS: Default - Use SSL/mTLS for authentication and encryption.
120+
# GSS_AUTH_ENCRYPT: Use GSSAPI for authentication, integrity and encryption.
121+
# GSS_AUTH_ONLY: Use GSSAPI for authentication and integrity only, encryption will be disabled.
122+
IMEX_AUTH_ENCRYPTION_MODE=SSL_TLS
123+
124+
### This is the beginning of configuration if IMEX_AUTH_ENCRYPTION_MODE=SSL_TLS mode. ###
125+
126+
# Description: This determines how IMEX will try to retrieve the keys, certificates, and certificate
127+
# authority for authentication and encryption.
128+
# If IMEX_AUTH_ENCRYPTION_MODE is SSL_TLS, then IMEX_AUTH_SOURCE must be configured
129+
# as one of the supported values. An empty or unexpected value will prevent initialization.
130+
# Possible Values:
131+
# FILE: The provided values are paths to files on the file system.
132+
# ENV_PATH: The provided values are environment variable names to retrieve, and the values in the
133+
# environment variables are treated as paths to files on the file system.
134+
# ENV_VAL: The provided values are environment variable names to retrieve, and the values in the
135+
# environment variables are treated as the actual values for the key/cert/cert auth.
136+
IMEX_AUTH_SOURCE=
137+
138+
# Description: These fields are interpreted based on how IMEX_AUTH_SOURCE is configured
139+
IMEX_SERVER_KEY=
140+
IMEX_SERVER_CERT=
141+
IMEX_SERVER_CERT_AUTH=
142+
IMEX_CLIENT_KEY=
143+
IMEX_CLIENT_CERT=
144+
IMEX_CLIENT_CERT_AUTH=
145+
146+
# Description: Override the target hostname for authentication of the certificates and keys. This allows
147+
# certificates with common names that do not match the ip addresses provided for the nodes.
148+
# Example:
149+
# If the certificate has the subject:
150+
# "/C=US/ST=CA/L=Santa Clara/O=NVIDIA/OU=Test/CN=localhost"
151+
# The certificate validation will expect the connection hostname to be "localhost", by
152+
# setting IMEX_SECURITY_TARGET_OVERRIDE=localhost you can override the connection
153+
# hostname for security purposes to be "localhost", allowing the connection to succeed.
154+
IMEX_SECURITY_TARGET_OVERRIDE=
155+
156+
### This is the end of IMEX SSL_TLS mode config parameters. ###
157+
158+
### This is the beginning of configuration if IMEX_AUTH_ENCRYPTION_MODE=GSS_AUTH_ENCRYPT/GSS_AUTH_ONLY mode. ###
159+
160+
# Description: Service Principal Name to use when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY.
161+
# Default Value: host
162+
IMEX_GSS_SERVICE_NAME=host
163+
164+
# Description: GSSAPI timeout (in sec) when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY.
165+
# Possible Values:
166+
# -1 : Default - Retry indefinitely
167+
# >= 0: Number of seconds to wait before triggering clean up
168+
IMEX_GSS_TIMEOUT_SEC=-1
169+
170+
# Description: GSSAPI retry interval (in sec) when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY.
171+
# Possible Values:
172+
# 5 : Default - Retry every 5 seconds
173+
# >= 0: Number of seconds to wait before retrying
174+
IMEX_GSS_RETRY_INTERVAL_SEC=5
175+
176+
# Description: GSSAPI security context lifetime (in sec) when IMEX_AUTH_ENCRYPTION_MODE is GSS_AUTH_ENCRYPT
177+
# or GSS_AUTH_ONLY.
178+
# Possible Values:
179+
# -1 : Default - Indefinite lifetime (limited by the credential lifetime)
180+
# >= 0: Security context lifetime in seconds
181+
IMEX_GSS_SEC_CONTEXT_LIFETIME_SEC=-1
182+
183+
# Description: Determines IMEX behavior during fatal GSSAPI failures or timeouts, when IMEX_AUTH_ENCRYPTION_MODE
184+
# is GSS_AUTH_ENCRYPT or GSS_AUTH_ONLY.
185+
# Possible Values:
186+
# 1 : Default - Shutdown IMEX daemon
187+
# 0 : Terminate connection to the failing peer node
188+
IMEX_GSS_SHUTDOWN_ON_FAILURE=1
189+
190+
### This is the end of IMEX GSS_AUTH_ENCRYPT/GSS_AUTH_ONLY mode config parameters. ###
191+
192+
# Description: Enables the command/control service to allow for querying information from the IMEX daemon.
193+
# Must be used with IMEX_CMD_PORT (optionally IMEX_CMD_BIND_INTERFACE_IP) and/or
194+
# IMEX_CMD_UNIX_DOMAIN_PATH
195+
IMEX_CMD_ENABLED=1
196+
197+
# Description: IP address to use to bind the command/control service. Ignored if IMEX_CMD_ENABLED=0
198+
# If empty, (but IMEX_CMD_PORT is specified), it will bind to all available interfaces.
199+
IMEX_CMD_BIND_INTERFACE_IP=
200+
201+
# Description: Port to bind to (in conjunction with IMEX_CMD_BIND_INTERFACE_IP) for the command/control service.
202+
# Ignored if IMEX_CMD_ENABLED=0
203+
IMEX_CMD_PORT=50005
204+
205+
# Description: Unix domain socket path to attach to for the command/control service. Ignored if IMEX_CMD_ENABLED=0
206+
IMEX_CMD_UNIX_DOMAIN_PATH=
207+
208+
# Description: Determines how long to wait after detecting that the IMEX daemon has lost connection to another
209+
# node before triggering clean up imports and exports from that node. If a connection is reestablished
210+
# before the grace period expires, and IMEX is able to identify that it is the same instance previously
211+
# connected, then no clean up is required. If a connection is established and IMEX detects that it is
212+
# a new instance (i.e. someone restarted the IMEX daemon), then clean up will be immediately triggered
213+
# regardless of grace period.
214+
# -1: Default - Wait indefinitely
215+
# 0: Immediately trigger clean up
216+
# >0: Number of seconds to wait before triggering clean up
217+
IMEX_NODE_DISCONNECTED_GRACE_TIME=-1
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
## Please replace the placeholder IP addresses below
2+
172.31.51.93
3+
172.31.48.43

0 commit comments

Comments
 (0)