Skip to content

Commit a0dab73

Browse files
Update on_create.sh to install and configure the EFA client for FSx, making it consistent with Slurm, which now supports EFA for FSx. (#920)
The change checks that the OS is supported for EFA-backed FSx and that the instance has EFA available before proceeding with the client installation. Because FSx is mounted later by EKS, we cannot verify beforehand whether the file system is EFA-enabled, or whether the file system and the instance are in the same AZ. In either of those cases FSx automatically falls back to TCP instead of EFA, so there is no drawback to installing the client at provisioning time.
1 parent 29b0f12 commit a0dab73

File tree

1 file changed

+120
-0
lines changed
  • 1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config

1 file changed

+120
-0
lines changed

1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,5 +109,125 @@ else
109109
logger "/opt/sagemaker not mounted. Skipping containerd configuration"
110110
fi
111111

112+
# ===== EFA FSx LUSTRE CLIENT SETUP =====
113+
114+
#######################################
# Download, extract and run the FSx for Lustre EFA client installer inside an
# existing scratch directory. Private helper of setup_efa_fsx_client.
# Arguments: $1 - writable directory that receives the archive and its
#                 extracted contents (the caller is responsible for removing it)
# Returns:   0 if setup.sh succeeded, 1 on download, extract or setup failure
#######################################
_efa_fsx_install_client() {
  local work_dir=$1
  local archive="${work_dir}/efa-setup.zip"
  local setup_script="configure-efa-fsx-lustre-client/setup.sh"
  local url="https://docs.aws.amazon.com/fsx/latest/LustreGuide/samples/configure-efa-fsx-lustre-client.zip"

  # NOTE(review): the archive is executed as downloaded; HTTPS is the only
  # integrity protection. Consider pinning a checksum if one is published.
  logger "[INFO] Downloading EFA FSx client setup..."
  if ! curl --fail --silent --show-error --max-time 30 -o "$archive" "$url"; then
    logger "[ERROR] Download failed"
    return 1
  fi

  logger "[INFO] Extracting setup files..."
  if ! unzip -q "$archive" -d "$work_dir"; then
    logger "[ERROR] Extract failed"
    return 1
  fi

  if [[ ! -f "${work_dir}/${setup_script}" ]]; then
    logger "[ERROR] Setup script not found in package"
    return 1
  fi

  chmod +x "${work_dir}/${setup_script}"

  logger "[INFO] Running EFA FSx client setup..."
  # Run from the extraction directory, but in a subshell so the caller's
  # working directory is never changed.
  if ( cd "$work_dir" && "./${setup_script}" ); then
    logger "[SUCCESS] EFA FSx client configured successfully"
  else
    logger "[ERROR] EFA FSx client setup failed"
    return 1
  fi

  return 0
}

#######################################
# Install and configure the EFA client for FSx for Lustre when the host
# qualifies for it.
#
# The setup is skipped (status 0) when the OS is not in the supported list,
# when an Ubuntu kernel is older than 6.8, or when EFA is not usable on this
# instance. Otherwise the AWS-provided installer is downloaded into a private
# temporary directory, executed, and the directory is removed again.
#
# Globals:   none are modified; /etc/os-release is read in a subshell and the
#            working directory of the caller is left untouched
# Arguments: none
# Outputs:   progress and error messages through logger
# Returns:   0 when the client was configured or the setup was skipped,
#            1 when the temporary directory, download, extraction or the
#            installer itself failed
#######################################
setup_efa_fsx_client() {
  logger "[INFO] Starting EFA FSx client setup"

  # Step 1: OS compatibility check
  # Source os-release in a subshell so its variables (ID, NAME, VERSION, ...)
  # do not leak into, or clobber, the variables of the calling script.
  local os_info os_id os_version
  if ! os_info=$(source /etc/os-release 2>/dev/null \
    && printf '%s %s' "${ID:-}" "${VERSION_ID:-}"); then
    logger "[INFO] Cannot detect OS, skipping"
    return 0
  fi
  os_id=${os_info%% *}
  os_version=${os_info#* }

  case "${os_id}-${os_version}" in
    "amzn-2023")
      logger "[INFO] Amazon Linux 2023 - supported"
      ;;
    "rhel-9."[5-9]* | "rhel-1"[0-9]*)
      logger "[INFO] RHEL $os_version - supported"
      ;;
    "ubuntu-22.04" | "ubuntu-2"[3-9]*)
      # Ubuntu additionally needs kernel 6.8 or newer.
      local kernel_release kernel_major kernel_minor
      local kernel_re='^([0-9]+)\.([0-9]+)'
      kernel_release=$(uname -r)
      if [[ ! "$kernel_release" =~ $kernel_re ]]; then
        logger "[INFO] Cannot parse kernel version '${kernel_release}', skipping"
        return 0
      fi
      kernel_major=${BASH_REMATCH[1]}
      kernel_minor=${BASH_REMATCH[2]}
      # 10# forces base 10 so a component with a leading zero is not read as octal.
      if (( 10#$kernel_major > 6 || (10#$kernel_major == 6 && 10#$kernel_minor >= 8) )); then
        logger "[INFO] Ubuntu $os_version kernel ${kernel_major}.${kernel_minor} - supported"
      else
        logger "[INFO] Ubuntu needs kernel 6.8+, found ${kernel_major}.${kernel_minor}, skipping"
        return 0
      fi
      ;;
    *)
      logger "[INFO] OS $os_id $os_version not supported, skipping"
      return 0
      ;;
  esac

  # Step 2: EFA availability check
  if [[ ! -x "/opt/amazon/efa/bin/fi_info" ]]; then
    logger "[INFO] EFA tools not found, skipping"
    return 0
  fi

  if ! /opt/amazon/efa/bin/fi_info -p efa >/dev/null 2>&1; then
    logger "[INFO] EFA not available on this instance, skipping"
    return 0
  fi

  logger "[INFO] EFA detected - configuring for FSx Lustre"

  # Step 3: Download and setup
  # A private mktemp directory avoids predictable file names in /tmp and
  # guarantees an empty extraction target, so unzip never prompts.
  local work_dir status=0
  if ! work_dir=$(mktemp -d /tmp/efa-fsx-client.XXXXXX); then
    logger "[ERROR] Cannot create temporary directory under /tmp"
    return 1
  fi

  _efa_fsx_install_client "$work_dir" || status=$?

  # Cleanup happens exactly once, on success and on every failure path.
  rm -rf -- "${work_dir:?}"
  return "$status"
}
191+
192+
# Load Lustre modules
193+
#######################################
# Best-effort loading of the Lustre kernel modules and LNet bring-up.
#
# Tries to modprobe lnet and then lustre, logging the outcome of each attempt,
# and finally runs "lctl network up" when the lctl command is available.
# A failure of any individual step is only logged; it never aborts the
# remaining steps.
#
# Arguments: none
# Outputs:   one logger message per step
# Returns:   0 when lctl is not available, otherwise the status of the last
#            logger call
#######################################
load_lustre_modules() {
  local kmod lnet_msg

  logger "[INFO] Loading Lustre kernel modules"

  # lnet must come before lustre, hence the fixed order.
  for kmod in lnet lustre; do
    if ! modprobe "$kmod" 2>/dev/null; then
      logger "[WARN] ${kmod} module load failed or already loaded"
    else
      logger "[INFO] ${kmod} module loaded"
    fi
  done

  # Without lctl there is nothing left to initialize.
  command -v lctl >/dev/null 2>&1 || return 0

  lnet_msg="[INFO] LNet network already active or initialization attempted"
  if lctl network up 2>/dev/null; then
    lnet_msg="[INFO] LNet network initialized"
  fi
  logger "$lnet_msg"
}
219+
220+
# Execute EFA FSx client setup
221+
# setup_efa_fsx_client returns 0 both when the client was configured and when
# the setup was skipped (unsupported OS, no EFA); it returns non-zero only on
# a real failure. The messages below reflect that, so a skip is not reported
# as a success and a failure is not reported as a possible skip.
if setup_efa_fsx_client; then
  logger "[INFO] EFA FSx client setup finished (configured, or skipped as not applicable)"
else
  logger "[WARN] EFA FSx client setup failed - continuing with standard Lustre"
fi

# Load Lustre modules (always execute, regardless of the EFA setup outcome)
load_lustre_modules

logger "[INFO] FSx client setup complete"

logger "no more steps to run"
logger "[stop] on_create.sh"

0 commit comments

Comments
 (0)