#!/bin/bash


# This script provisions a minimal, non-production-ready EKS cluster with two nodes and a Karpenter configuration. It installs Karpenter and the KubeRay operator on the cluster.


# Check if the script has passed CLUSTER_NAME and REGION as arguments
# if [[ $# -ne 2 ]]; then
#     echo "Usage: $0 <CLUSTER_NAME> <REGION>"
#     exit 1
# fi



REGION=us-east-1
CLUSTER_NAME=llm-eks-cluster
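# Optional sketch: the values above could instead come from the positional arguments
# described in the commented-out check, falling back to these defaults when omitted.
# CLUSTER_NAME=${1:-llm-eks-cluster}
# REGION=${2:-us-east-1}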


# Check if jq is installed
if ! command -v jq &> /dev/null; then
    echo "jq is not installed. Please install jq and try again."
    exit 1
fi

# Check if yq is installed
if ! command -v yq &> /dev/null; then
    echo "yq is not installed. Please install yq and try again."
    exit 1
fi

# Check if eksctl is installed
if ! command -v eksctl &> /dev/null; then
    echo "eksctl is not installed. Please install eksctl and try again."
    exit 1
fi
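# The script also relies on the AWS CLI, kubectl, and helm later on; an optional sketch
# of the same guard for those tools:
# for tool in aws kubectl helm; do
#     if ! command -v "$tool" &> /dev/null; then
#         echo "$tool is not installed. Please install $tool and try again."
#         exit 1
#     fi
# done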


# Check if the cluster already exists and delete it if it does
if aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" >/dev/null 2>&1; then
    echo "Found existing cluster ${CLUSTER_NAME} in region ${REGION}, proceeding with deletion..."
    eksctl delete cluster --name "${CLUSTER_NAME}" --region "${REGION}" --wait
    echo "Cluster deletion completed"
else
    echo "No existing cluster named ${CLUSTER_NAME} in region ${REGION}, skipping deletion"
fi

# Fallback: if the cluster is still reported after the eksctl deletion, delete its CloudFormation stack directly
if aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" >/dev/null 2>&1; then
    echo "Found existing cluster ${CLUSTER_NAME} in region ${REGION}, proceeding with deletion..."

    # Delete the CloudFormation stack directly
    STACK_NAME="eksctl-${CLUSTER_NAME}-cluster"
    echo "Deleting CloudFormation stack: ${STACK_NAME}"
    aws cloudformation delete-stack --stack-name "${STACK_NAME}" --region "${REGION}"
    echo "Waiting for stack deletion to complete..."
    aws cloudformation wait stack-delete-complete --stack-name "${STACK_NAME}" --region "${REGION}"
    echo "Stack deletion completed"
else
    echo "No existing cluster named ${CLUSTER_NAME} in region ${REGION}, skipping deletion"
fi



ALLOWED_VPC=$(aws service-quotas get-service-quota --service-code vpc --quota-code L-F678F1CE --region "${REGION}" | jq '.Quota.Value | floor')
CONSUMED_VPC=$(aws ec2 describe-vpcs --region "${REGION}" --query "Vpcs" --output json | jq '. | length')
if [[ $CONSUMED_VPC -ge $ALLOWED_VPC ]]; then
    echo "You have reached the VPC limit for your account, and this script needs to create a new VPC. Exiting ..."
    exit 1
else
    echo "You have $CONSUMED_VPC VPCs in your account. You can create $((ALLOWED_VPC - CONSUMED_VPC)) more VPCs."
fi

# Get the IAM principal ARN and store it in a variable
# IAM_PRINCIPAL_ARN=$(aws sts get-caller-identity --query "Arn" --output text)
IAM_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
IAM_PRINCIPAL_ARN=arn:aws:sts::${IAM_ACCOUNT_ID}:assumed-role/Admin/{{SessionName}}
IAM_PRINCIPAL_ROLE_ARN=arn:aws:iam::${IAM_ACCOUNT_ID}:role/Admin

# Validate and fail if there is no principal ARN
if [[ -z $IAM_PRINCIPAL_ARN ]]; then
    echo "Failed to get the IAM principal ARN. Make sure that you are logged in to the AWS CLI with the correct user."
    exit 1
fi


# Create the cluster using eksctl

eksctl create cluster --name=$CLUSTER_NAME --version 1.31 --nodes=2 --region=$REGION --managed --auto-kubeconfig
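# Optional sketch: eksctl normally blocks until the cluster is ready, but an explicit
# wait can be added before the addon calls below.
# aws eks wait cluster-active --name "$CLUSTER_NAME" --region "$REGION"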

# Add managed addons
# Check and install VPC-CNI
if ! aws eks describe-addon --cluster-name $CLUSTER_NAME --addon-name vpc-cni >/dev/null 2>&1; then
    echo "Installing vpc-cni addon..."
    aws eks create-addon --cluster-name $CLUSTER_NAME --addon-name vpc-cni
else
    echo "vpc-cni addon already exists"
fi
aws eks describe-addon --addon-name vpc-cni --cluster-name $CLUSTER_NAME

# Check and install CoreDNS
if ! aws eks describe-addon --cluster-name $CLUSTER_NAME --addon-name coredns >/dev/null 2>&1; then
    echo "Installing coredns addon..."
    aws eks create-addon --cluster-name $CLUSTER_NAME --addon-name coredns
else
    echo "coredns addon already exists"
fi
aws eks describe-addon --addon-name coredns --cluster-name $CLUSTER_NAME

# Check and install Pod Identity Agent
if ! aws eks describe-addon --cluster-name $CLUSTER_NAME --addon-name eks-pod-identity-agent >/dev/null 2>&1; then
    echo "Installing eks-pod-identity-agent addon..."
    aws eks create-addon --cluster-name $CLUSTER_NAME --addon-name eks-pod-identity-agent
else
    echo "eks-pod-identity-agent addon already exists"
fi
aws eks describe-addon --addon-name eks-pod-identity-agent --cluster-name $CLUSTER_NAME
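# Optional sanity check: list every addon now installed on the cluster.
# aws eks list-addons --cluster-name $CLUSTER_NAME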



# List the available EKS access policies
aws eks list-access-policies

aws eks update-cluster-config --name $CLUSTER_NAME --access-config authenticationMode=API


# Add an access entry to grant the current AWS principal the ClusterAdmin access policy
#aws eks create-access-entry --cluster-name $CLUSTER_NAME --principal-arn $IAM_PRINCIPAL_ROLE_ARN
#aws eks associate-access-policy --cluster-name $CLUSTER_NAME --principal-arn $IAM_PRINCIPAL_ARN --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy --access-scope type=cluster


aws eks update-kubeconfig --name $CLUSTER_NAME --region $REGION
kubectl -n kube-system get pods

# Install cert-manager
echo "Adding Cert Manager ..."
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.15.3/cert-manager.yaml
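# Optional sketch: wait until the cert-manager deployments are available before moving on.
# kubectl -n cert-manager wait --for=condition=Available deployment --all --timeout=300s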

echo "Base EKS Setup completed"




# Deploy Karpenter on EKS
echo "Adding Karpenter ..."

export KARPENTER_NAMESPACE="kube-system"
export KARPENTER_VERSION="1.0.3"
export K8S_VERSION="1.31"
export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov
export AWS_DEFAULT_REGION=${REGION}
export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
export TEMPOUT="$(mktemp)"


KARPENTER_ROLE_NAME="FMKarpenterRole"
KARPENTER_POLICY_NAME="FMKARPENTERPolicy"

export KARPENTER_ROLE_NAME

# Create the IAM role that Karpenter will assume via EKS Pod Identity
aws iam create-role --role-name $KARPENTER_ROLE_NAME --assume-role-policy-document '{
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {
            "Service": "pods.eks.amazonaws.com"
        },
        "Action": ["sts:AssumeRole","sts:TagSession"]
    }]
}'

# Create the IAM policy for the Karpenter controller
# Actions largely follow the Karpenter migration guide: https://karpenter.sh/docs/getting-started/migrating-from-cas/
aws iam create-policy --policy-name $KARPENTER_POLICY_NAME --policy-document '{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "ssm:GetParameter",
        "ec2:DescribeImages",
        "ec2:RunInstances",
        "ec2:DescribeSubnets",
        "ec2:DescribeSecurityGroups",
        "ec2:DescribeLaunchTemplates",
        "ec2:DescribeInstances",
        "ec2:DescribeInstanceTypes",
        "ec2:DescribeInstanceTypeOfferings",
        "ec2:DeleteLaunchTemplate",
        "ec2:CreateTags",
        "ec2:CreateLaunchTemplate",
        "ec2:CreateFleet",
        "ec2:DescribeSpotPriceHistory",
        "pricing:GetProducts",
        "ec2:TerminateInstances",
        "iam:PassRole",
        "eks:DescribeCluster",
        "iam:CreateInstanceProfile",
        "iam:TagInstanceProfile",
        "iam:AddRoleToInstanceProfile",
        "iam:RemoveRoleFromInstanceProfile",
        "iam:DeleteInstanceProfile",
        "iam:GetInstanceProfile",
        "sqs:*"
      ],
      "Resource": [
        "*"
      ]
    }
  ]
}'

# Attach the policy to the role
KARPENTER_POLICY_ARN=$(aws iam list-policies --query "Policies[?PolicyName=='$KARPENTER_POLICY_NAME'].Arn" --output text)
aws iam attach-role-policy --role-name $KARPENTER_ROLE_NAME --policy-arn $KARPENTER_POLICY_ARN

# Get the ARN of the role and associate it with the karpenter service account via Pod Identity
KARPENTER_ROLE_ARN=$(aws iam get-role --role-name $KARPENTER_ROLE_NAME --query "Role.Arn" --output text)
aws eks create-pod-identity-association --cluster-name $CLUSTER_NAME --namespace karpenter --service-account karpenter --role-arn $KARPENTER_ROLE_ARN
aws eks list-pod-identity-associations --cluster-name $CLUSTER_NAME

CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${CLUSTER_NAME} --query "cluster.endpoint" --output text)"
echo $CLUSTER_ENDPOINT
echo $KARPENTER_ROLE_ARN

# Create an SQS queue for Karpenter interruption handling
QUEUE_NAME=${CLUSTER_NAME}
aws sqs create-queue --queue-name $QUEUE_NAME --region $REGION
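# Karpenter's interruption handling also expects EventBridge rules that forward EC2 events
# to this queue (plus a queue policy that lets events.amazonaws.com send messages).
# Optional sketch for the Spot interruption warning; the rule name is illustrative, and the
# rebalance-recommendation, state-change, and health events would follow the same pattern.
# QUEUE_URL=$(aws sqs get-queue-url --queue-name $QUEUE_NAME --query QueueUrl --output text)
# QUEUE_ARN=$(aws sqs get-queue-attributes --queue-url $QUEUE_URL --attribute-names QueueArn --query "Attributes.QueueArn" --output text)
# aws events put-rule --name "${CLUSTER_NAME}-spot-interruption" \
#     --event-pattern '{"source":["aws.ec2"],"detail-type":["EC2 Spot Instance Interruption Warning"]}'
# aws events put-targets --rule "${CLUSTER_NAME}-spot-interruption" --targets "Id"="1","Arn"="${QUEUE_ARN}"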

helm registry logout public.ecr.aws

helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version "${KARPENTER_VERSION}" --namespace karpenter --create-namespace \
  --set "serviceAccount.annotations.eks\.amazonaws\.com/role-arn=arn:aws:iam::${AWS_ACCOUNT_ID}:role/${KARPENTER_ROLE_NAME}" \
  --set "settings.clusterName=${CLUSTER_NAME}" \
  --set "settings.interruptionQueue=${CLUSTER_NAME}" \
  --set controller.resources.requests.cpu=1 \
  --set controller.resources.requests.memory=1Gi \
  --set controller.resources.limits.cpu=1 \
  --set controller.resources.limits.memory=1Gi \
  --wait
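# Optional sanity check: confirm the Karpenter controller is up before registering NodePools
# (the deployment name assumes the chart default for this release name).
# kubectl -n karpenter rollout status deployment/karpenter --timeout=300s
# kubectl -n karpenter get pods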




## KubeRay
# https://github.com/ray-project/kuberay/blob/master/helm-chart/ray-cluster/README.md
echo "Adding KubeRay Operator ..."
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm repo update

helm install kuberay-operator kuberay/kuberay-operator --namespace kuberay-system --create-namespace --version 1.2.2
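# Optional sketch (see the ray-cluster chart README linked above): a sample RayCluster could
# be installed from the same Helm repo; the release name here is illustrative.
# helm install raycluster kuberay/ray-cluster --version 1.2.2 --namespace kuberay-system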

# Register Node Pools
# Get the role of the managed worker nodes so it can be associated with Karpenter-provisioned nodes
NODE_ROLE_PATTERN=eksctl-${CLUSTER_NAME}-nodegroup-ng-
NODE_ROLE_NAME=$(aws iam list-roles --query "Roles[?starts_with(RoleName, '${NODE_ROLE_PATTERN}')].RoleName" --output text | head -n 1)


export CLUSTER_NAME
export NODE_ROLE_NAME

update_cluster_name_in_yaml() {
    local filename=$1
    echo "Changing $filename to update the cluster tag for your cluster $CLUSTER_NAME"
    yq eval 'select(.kind == "EC2NodeClass").spec.subnetSelectorTerms[].tags."eksctl.cluster.k8s.io/v1alpha1/cluster-name" = env(CLUSTER_NAME)' -i "$filename"
    yq eval 'select(.kind == "EC2NodeClass").spec.securityGroupSelectorTerms[].tags."eksctl.cluster.k8s.io/v1alpha1/cluster-name" = env(CLUSTER_NAME)' -i "$filename"
    yq eval 'select(.kind == "EC2NodeClass").spec.role = env(NODE_ROLE_NAME)' -i "$filename"
}

update_cluster_name_in_yaml "../karpenter-pools/karpenter-cpu.yaml"
update_cluster_name_in_yaml "../karpenter-pools/karpenter-cpu-inference.yaml"
update_cluster_name_in_yaml "../karpenter-pools/karpenter-cpu-inference-arm.yaml"

echo "Creating Karpenter NodePools..."
kubectl create -f ../karpenter-pools/karpenter-cpu.yaml
kubectl create -f ../karpenter-pools/karpenter-cpu-inference.yaml
kubectl create -f ../karpenter-pools/karpenter-cpu-inference-arm.yaml
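# Optional sanity check: confirm the Karpenter NodePools and EC2NodeClasses were registered.
# kubectl get nodepools,ec2nodeclasses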


kubectl config set-context --current --namespace=kuberay-system # set the default namespace to kuberay-system

echo "Ready to deploy your Ray Service...."
#######################################

echo "Adding Grafana and Prometheus"
# Prometheus and Grafana
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add grafana https://grafana.github.io/helm-charts

helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace --wait

helm install grafana grafana/grafana --namespace monitoring --create-namespace --wait

# Print the generated Grafana admin password
kubectl get secret --namespace monitoring grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
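# Optional sketch: reach the Grafana UI locally with the admin password printed above
# (the grafana chart exposes its service on port 80 by default).
# kubectl --namespace monitoring port-forward svc/grafana 3000:80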

kubectl create -f prometheus-monitoring.yaml
