You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
A bash script that automates the manual cluster creation process for SageMaker HyperPod SLURM
1
+
# SageMaker HyperPod Cluster Automation Script
3
2
4
-
This script automates the steps from the [SageMaker HyperPod SLURM Workshop](https://catalog.workshops.aws/sagemaker-hyperpod/en-US).
3
+
This project provides a script to automate the creation and setup of a SageMaker HyperPod cluster with SLURM integration.
5
4
6
-
## 🚀 Installation and Usage
7
-
Using this script is very simple. Run `bash automate-cluster-creation.sh`.
5
+
The automation script streamlines the process of setting up a distributed training environment using AWS SageMaker HyperPod.
6
+
It handles the installation and configuration of necessary tools, clones the required repository, sets up environment variables, and configures lifecycle scripts for the SageMaker HyperPod architecture.
8
7
9
-
The script will walk you through creating the cluster configuration for your SageMaker HyperPod Slurm cluster. Please read through the instructions provided while running the script for the best experience.
# Print the training plan's Availability Zone; a space separates the coloured label from the value.
echo -e "${YELLOW}Training Plan Availability Zone:${NC} $TRAINING_PLAN_AZ"
431
+
# Print the training plan's instance type; a space separates the coloured label from the value.
echo -e "${YELLOW}Training Plan Instance Type:${NC} $TP_INSTANCE_TYPE"
432
+
fi
433
+
434
+
# Compare INSTANCE_COUNT with AVAILABLE_INSTANCE_COUNT
435
+
INSTANCE_COUNT_OK="n"
436
+
# Numeric comparison. The -gt operator must be its own word: without the
# surrounding spaces [[ ]] sees a single non-empty string and is always true.
if [[ "$INSTANCE_COUNT" -gt "$AVAILABLE_INSTANCE_COUNT" ]]; then
437
+
echo -e "${YELLOW}Warning: The requested instance count ($INSTANCE_COUNT) is greater than the available instances in the training plan ($AVAILABLE_INSTANCE_COUNT).${NC}"
438
+
# Ask whether to proceed despite the instance-count mismatch.
echo -e "${BLUE}Do you want to continue anyway? (yes/no)${NC}"
439
+
read -e CONTINUE
440
+
# Enter the correction branch only when the answer is not exactly "yes".
# The != operator needs surrounding spaces; otherwise the test is a single
# non-empty word and always succeeds, ignoring the user's answer.
if [[ "$CONTINUE" != "yes" ]]; then
441
+
# Ask for a replacement instance count. The prompt and "1" must be separate
# arguments; adjacent quoted strings concatenate into a single word.
# NOTE(review): presumably get_input's second argument is the default value —
# confirm against its definition.
NEW_INSTANCE_COUNT=$(get_input "Enter the new number of instances" "1")
442
+
# Update INSTANCE_GROUPS with new INSTANCE_COUNT for the current worker group
echo -e "${GREEN}Updated instance count for worker-group-$WORKER_GROUP_COUNT to $INSTANCE_COUNT${NC}"
459
+
fi
460
+
INSTANCE_COUNT_OK="y"
461
+
else
462
+
INSTANCE_COUNT_OK="y"
463
+
fi
464
+
465
+
# Proceed to the instance-type check only once the count check has passed.
# The == operator needs surrounding spaces; otherwise the test is a single
# non-empty word and is always true.
if [[ "$INSTANCE_COUNT_OK" == "y" ]]; then
466
+
INSTANCE_TYPE_OK="n"
467
+
# Compare INSTANCE_TYPE with TP_INSTANCE_TYPE
468
+
# Warn only when the requested type differs from the training plan's type.
# Spaces around != make it an operator; quoting the right-hand side forces a
# literal (non-glob) comparison.
if [[ "$INSTANCE_TYPE" != "$TP_INSTANCE_TYPE" ]]; then
469
+
echo -e "${YELLOW}Warning: The requested instance type ($INSTANCE_TYPE) does not match the instance type in the training plan ($TP_INSTANCE_TYPE).${NC}"
470
+
# Ask whether to keep the mismatching instance type. The inner quotes around
# "no" are escaped so they are printed instead of terminating the string.
echo -e "${BLUE}Do you want to continue anyway? If you choose \"no\", then the script will update instance type for you and proceed. (yes/no)${NC}"
471
+
read -e CONTINUE
472
+
# Replace the instance type only when the answer is not exactly "yes".
# The != operator needs surrounding spaces; otherwise the test is a single
# non-empty word and always succeeds, ignoring the user's answer.
if [[ "$CONTINUE" != "yes" ]]; then
473
+
NEW_INSTANCE_TYPE=$TP_INSTANCE_TYPE
474
+
# Update INSTANCE_GROUPS with new INSTANCE_TYPE for the current worker group
echo -e "${GREEN}Updated instance type for worker-group-$WORKER_GROUP_COUNT to $INSTANCE_TYPE${NC}"
491
+
fi
492
+
INSTANCE_TYPE_OK="y"
493
+
else
494
+
INSTANCE_TYPE_OK="y"
495
+
fi
496
+
497
+
# Proceed to the Availability Zone check only once the type check has passed.
# The == operator needs surrounding spaces; otherwise the test is a single
# non-empty word and is always true.
if [[ "$INSTANCE_TYPE_OK" == "y" ]]; then
498
+
# Compare TRAINING_PLAN_AZ with CF_AZ
499
+
# Warn only when the training plan's AZ differs from the cluster's AZ.
# Spaces around != make it an operator; quoting the right-hand side forces a
# literal (non-glob) comparison.
if [[ "$TRAINING_PLAN_AZ" != "$CF_AZ" ]]; then
500
+
echo -e "${YELLOW}Warning: The training plan availability zone ($TRAINING_PLAN_AZ) does not match the cluster availability zone ($CF_AZ).${NC}"
501
+
echo -e "${BLUE}Do you want to continue anyway? (yes/no)${NC}"
502
+
read -e CONTINUE
503
+
# Abort this training-plan configuration unless the answer is exactly "yes".
# The != operator needs surrounding spaces; otherwise the test is a single
# non-empty word and always succeeds, ignoring the user's answer.
if [[ "$CONTINUE" != "yes" ]]; then
504
+
echo -e "${YELLOW}Please ensure that your VPC is in the same Availability Zone as your training plan (or vice versa). If you used the workshop, this should be the CF stack \"sagemaker-hyperpod\". Exiting training plan configuration.${NC}"
505
+
continue
506
+
fi
507
+
fi
508
+
fi
509
+
fi
510
+
511
+
echo -e "${GREEN}Adding Training Plan ARN to instance group configuration.${NC}"
0 commit comments