
Commit d134211: "solutions deployed"
1 parent 625aa2e

19 files changed, +1705 -10 lines

.DS_Store (6 KB): binary file not shown.

README.md

Lines changed: 14 additions & 10 deletions

@@ -1,17 +1,21 @@
-## My Project
+# EKS-LLM

-TODO: Fill this README out!
-
-Be sure to:
-
-* Change the title in this README
-* Edit your repository description on GitHub
+## Getting started

-## Security
+The main goal of this repo is to make it easier for the audience to test and validate multiple LLMs with different runtime engines on EKS.
+This repo aims to provide the following:
+1. A simple script to deploy EKS with Karpenter NodePools for CPU, KubeRay and an OSS observability stack. The script is available in the base_eks_setup folder; make sure you have authenticated with the AWS CLI before running it. The script will provision a cluster in the Sydney region and install Grafana/Prometheus and KubeRay to serve models.
+2. Ray Serve applications to serve llama.cpp (CPU) and vLLM (GPU) models. The ray-server folder contains the Python code that enables Ray Serve to serve the model using the llama.cpp engine.
+3. Ray Services to deploy clusters on EKS. The Ray cluster configuration is available in the ray-services folder.
+4. Once you have deployed your Ray cluster, you can provision a load balancer via the ray-services/ingress folder.
+5. Performance scripts to capture results and select what works for you. The docketfiles/benchmark folder contains a Go program to hit the deployed model over HTTP.

-See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
+> **Warning**
+> Make sure to change the HF ID in the ray-services/ray-service-vllm-llama-3.2-CPU-LLAMA.yaml and ray-service-vllm-llama-3.2-CPU-LLAMA-arm.yaml files.

-## License
-
-This library is licensed under the MIT-0 License. See the LICENSE file.
+Please refer to the blog XXX, which uses these scripts for measurements of Intel vs Graviton.

+## Contact
+Please contact wangaws@ or fmamazon@ if you want to know more and/or contribute.
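A minimal sketch of the workflow the new README describes, assuming the folder layout in this commit (base_eks_setup, ray-services), an authenticated AWS CLI, and that Ray Services land in the kuberay-system namespace the setup script switches to; the Serve service name and smoke-test call are illustrative, not taken from the repo:

# 1. Provision the cluster, Karpenter NodePools, KubeRay and the observability stack
cd base_eks_setup
./provision-v2.sh

# 2. Deploy one of the Ray Services (set your HF ID in the YAML first, as the warning notes)
kubectl apply -f ../ray-services/ray-service-vllm-llama-3.2-CPU-LLAMA.yaml

# 3. Watch the RayService come up, then port-forward the Serve endpoint for a quick test
kubectl get rayservices -n kuberay-system -w
kubectl port-forward -n kuberay-system svc/<your-rayservice>-serve-svc 8000:8000   # hypothetical service name; depends on the RayService name
curl http://localhost:8000/                                                        # illustrative smoke test only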
prometheus-monitoring.yaml

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: vllm
  namespace: monitoring
  labels:
    release: kube-prometheus-stack
spec:
  endpoints:
    - path: '/metrics/'
      port: metrics
  selector:
    matchLabels:
      app.kubernetes.io/name: kuberay
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  labels:
    release: prometheus
  name: kuberay-cluster
  namespace: monitoring # ns where Prometheus is deployed
spec:
  podMetricsEndpoints:
    - port: metrics
      path: '/metrics/'
  namespaceSelector:
    matchNames:
      - kuberay-system # ns where the Ray cluster is deployed
  selector:
    matchLabels:
      app.kubernetes.io/name: kuberay
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: ray-workers-monitor
  namespace: monitoring
  labels:
    # `release: $HELM_RELEASE`: Prometheus can only detect a PodMonitor with this label.
    release: prometheus
spec:
  jobLabel: ray-workers
  # Only select Kubernetes Pods in the "kuberay-system" namespace.
  namespaceSelector:
    matchNames:
      - kuberay-system
  # Only select Kubernetes Pods with "matchLabels".
  selector:
    matchLabels:
      ray.io/node-type: worker
  # A list of endpoints allowed as part of this PodMonitor.
  podMetricsEndpoints:
    - port: metrics
      path: '/metrics/'
      relabelings:
        - sourceLabels: [__meta_kubernetes_pod_label_ray_io_cluster]
          targetLabel: ray_io_cluster
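A quick way to confirm Prometheus picks these monitors up, assuming the kube-prometheus-stack release named "prometheus" that provision-v2.sh installs; the Prometheus service name below is that chart's usual default and is an assumption, not something shown in this commit:

# List the monitors, then port-forward Prometheus and check its targets page
kubectl get servicemonitors,podmonitors -n monitoring
kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090   # assumed default service name
# then open http://localhost:9090/targets and look for the kuberay / ray-workers jobs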

base_eks_setup/provision-v2.sh

Lines changed: 302 additions & 0 deletions
@@ -0,0 +1,302 @@
#!/bin/bash

# This script provisions a minimal, non-production-ready EKS cluster with two nodes and a
# Karpenter configuration. It installs Karpenter and the KubeRay operator on the cluster.

# Check if the script has been passed CLUSTER_NAME and REGION as arguments
# if [[ $# -ne 2 ]]; then
#     echo "Usage: $0 <param1> <param2>"
#     exit 1
# fi

REGION=us-east-1
CLUSTER_NAME=llm-eks-cluster

# Check if jq is installed
if ! command -v jq &> /dev/null; then
    echo "jq is not installed. Please install jq and try again."
    exit 1
fi

# Check if yq is installed
if ! command -v yq &> /dev/null; then
    echo "yq is not installed. Please install yq and try again."
    exit 1
fi

# Check if eksctl is installed
if ! command -v eksctl &> /dev/null; then
    echo "eksctl is not installed. Please install eksctl and try again."
    exit 1
fi

# Check if the cluster exists and delete it if it does
if aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" >/dev/null 2>&1; then
    echo "Found existing cluster ${CLUSTER_NAME} in region ${REGION}, proceeding with deletion..."
    eksctl delete cluster --name "${CLUSTER_NAME}" --region "${REGION}" --wait
    echo "Cluster deletion completed"
else
    echo "No existing cluster named ${CLUSTER_NAME} in region ${REGION}, skipping deletion"
fi

# If the cluster still exists, delete its CloudFormation stack directly
if aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" >/dev/null 2>&1; then
    echo "Found existing cluster ${CLUSTER_NAME} in region ${REGION}, proceeding with deletion..."

    # Delete the CloudFormation stack directly
    STACK_NAME="eksctl-${CLUSTER_NAME}-cluster"
    echo "Deleting CloudFormation stack: ${STACK_NAME}"
    aws cloudformation delete-stack --stack-name "${STACK_NAME}"
    echo "Waiting for stack deletion to complete..."
    aws cloudformation wait stack-delete-complete --stack-name "${STACK_NAME}"
    echo "Stack deletion completed"
else
    echo "No existing cluster named ${CLUSTER_NAME} in region ${REGION}, skipping deletion"
fi

# Fail early if the account has no VPC quota headroom
ALLOWED_VPC=$(aws service-quotas get-service-quota --service-code vpc --quota-code L-F678F1CE | jq '.Quota.Value | floor')
CONSUMED_VPC=$(aws ec2 describe-vpcs --query "Vpcs" --output json | jq '. | length')
if [[ $CONSUMED_VPC -ge $ALLOWED_VPC ]]; then
    echo "You have reached the limit of VPCs in your account. This script needs to create a new VPC! Exiting ..."
    exit 1
else
    echo "You have $CONSUMED_VPC VPCs in your account. You can create $((ALLOWED_VPC - CONSUMED_VPC)) more VPCs."
fi

# Get the caller identity and build the IAM principal ARNs
# IAM_PRINCIPAL_ARN=$(aws sts get-caller-identity --query "Arn" --output text)
IAM_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
IAM_PRINCIPAL_ARN=arn:aws:sts::${IAM_ACCOUNT_ID}:assumed-role/Admin/{{SessionName}}
IAM_PRINCIPAL_ROLE_ARN=arn:aws:iam::${IAM_ACCOUNT_ID}:role/Admin

# Validate and fail if there is no principal ARN
if [[ -z $IAM_PRINCIPAL_ARN ]]; then
    echo "Failed to get the user ARN. Make sure that you are logged in to the AWS CLI with the correct user."
    exit 1
fi

# Create the cluster using eksctl
eksctl create cluster --name=$CLUSTER_NAME --version 1.31 --nodes=2 --region=$REGION --managed --auto-kubeconfig

# Add addons
# Check and install VPC-CNI
if ! aws eks describe-addon --cluster-name $CLUSTER_NAME --addon-name vpc-cni >/dev/null 2>&1; then
    echo "Installing vpc-cni addon..."
    aws eks create-addon --cluster-name $CLUSTER_NAME --addon-name vpc-cni
else
    echo "vpc-cni addon already exists"
fi
aws eks describe-addon --addon-name vpc-cni --cluster-name $CLUSTER_NAME

# Check and install CoreDNS
if ! aws eks describe-addon --cluster-name $CLUSTER_NAME --addon-name coredns >/dev/null 2>&1; then
    echo "Installing coredns addon..."
    aws eks create-addon --cluster-name $CLUSTER_NAME --addon-name coredns
else
    echo "coredns addon already exists"
fi
aws eks describe-addon --addon-name coredns --cluster-name $CLUSTER_NAME

# Check and install the Pod Identity Agent
if ! aws eks describe-addon --cluster-name $CLUSTER_NAME --addon-name eks-pod-identity-agent >/dev/null 2>&1; then
    echo "Installing eks-pod-identity-agent addon..."
    aws eks create-addon --cluster-name $CLUSTER_NAME --addon-name eks-pod-identity-agent
else
    echo "eks-pod-identity-agent addon already exists"
fi
aws eks describe-addon --addon-name eks-pod-identity-agent --cluster-name $CLUSTER_NAME

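# Optional sanity check (suggested addition, not part of the original script): list the
# addons and wait until each reports ACTIVE before continuing.
aws eks list-addons --cluster-name $CLUSTER_NAME
aws eks wait addon-active --cluster-name $CLUSTER_NAME --addon-name vpc-cni
aws eks wait addon-active --cluster-name $CLUSTER_NAME --addon-name coredns
aws eks wait addon-active --cluster-name $CLUSTER_NAME --addon-name eks-pod-identity-agent
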
# List access policies
aws eks list-access-policies

aws eks update-cluster-config --name $CLUSTER_NAME --access-config authenticationMode=API

# Add an access entry to grant the current AWS principal the ClusterAdmin access policy
#aws eks create-access-entry --cluster-name $CLUSTER_NAME --principal-arn $IAM_PRINCIPAL_ROLE_ARN
#aws eks associate-access-policy --cluster-name $CLUSTER_NAME --principal-arn $IAM_PRINCIPAL_ARN --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy --access-scope type=cluster

aws eks update-kubeconfig --name $CLUSTER_NAME --region $REGION
kubectl -n kube-system get pods

# Install cert-manager
echo "Adding Cert Manager ..."
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.15.3/cert-manager.yaml

echo "Base EKS Setup completed"

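# Optional check (suggested addition, not in the original script): wait for the cert-manager
# deployments to become Available before moving on to the Karpenter and KubeRay installs.
kubectl wait --namespace cert-manager --for=condition=Available deployment --all --timeout=180s
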
# Deploy Karpenter on EKS
echo "Adding Karpenter ..."

export KARPENTER_NAMESPACE="kube-system"
export KARPENTER_VERSION="1.0.3"
export K8S_VERSION="1.31"
export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure aws-cn / aws-us-gov
export AWS_DEFAULT_REGION=${REGION}
export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
export TEMPOUT="$(mktemp)"

KARPENTER_ROLE_NAME="FMKarpenterRole"
KARPENTER_POLICY_NAME="FMKARPENTERPolicy"

export KARPENTER_ROLE_NAME

# Create the IAM role that Karpenter assumes via EKS Pod Identity
aws iam create-role --role-name $KARPENTER_ROLE_NAME --assume-role-policy-document '{
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {
            "Service": "pods.eks.amazonaws.com"
        },
        "Action": ["sts:AssumeRole", "sts:TagSession"]
    }]
}'

# Create the policy
# https://karpenter.sh/docs/getting-started/migrating-from-cas/ is useful too
aws iam create-policy --policy-name $KARPENTER_POLICY_NAME --policy-document '{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "ssm:GetParameter",
                "ec2:DescribeImages",
                "ec2:RunInstances",
                "ec2:DescribeSubnets",
                "ec2:DescribeSecurityGroups",
                "ec2:DescribeLaunchTemplates",
                "ec2:DescribeInstances",
                "ec2:DescribeInstanceTypes",
                "ec2:DescribeInstanceTypeOfferings",
                "ec2:DeleteLaunchTemplate",
                "ec2:CreateTags",
                "ec2:CreateLaunchTemplate",
                "ec2:CreateFleet",
                "ec2:DescribeSpotPriceHistory",
                "pricing:GetProducts",
                "ec2:TerminateInstances",
                "iam:PassRole",
                "eks:DescribeCluster",
                "iam:CreateInstanceProfile",
                "iam:TagInstanceProfile",
                "iam:AddRoleToInstanceProfile",
                "iam:RemoveRoleFromInstanceProfile",
                "iam:DeleteInstanceProfile",
                "iam:GetInstanceProfile",
                "sqs:*"
            ],
            "Resource": [
                "*"
            ]
        }
    ]
}'

# Attach the policy to the role
KARPENTER_POLICY_ARN=$(aws iam list-policies --query "Policies[?PolicyName=='$KARPENTER_POLICY_NAME'].Arn" --output text)
aws iam attach-role-policy --role-name $KARPENTER_ROLE_NAME --policy-arn $KARPENTER_POLICY_ARN

# Get the ARN of the role and associate it with the karpenter service account via Pod Identity
KARPENTER_ROLE_ARN=$(aws iam get-role --role-name $KARPENTER_ROLE_NAME --query "Role.Arn" --output text)
aws eks create-pod-identity-association --cluster-name $CLUSTER_NAME --namespace karpenter --service-account karpenter --role-arn $KARPENTER_ROLE_ARN
aws eks list-pod-identity-associations --cluster-name $CLUSTER_NAME

CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${CLUSTER_NAME} --query "cluster.endpoint" --output text)"
echo $CLUSTER_ENDPOINT
echo $KARPENTER_ROLE_ARN

# Create an SQS queue for interruption handling
QUEUE_NAME=${CLUSTER_NAME}
aws sqs create-queue --queue-name $QUEUE_NAME --region $REGION

helm registry logout public.ecr.aws

helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version 1.0.0 --namespace karpenter --create-namespace \
    --set "serviceAccount.annotations.eks\.amazonaws\.com/role-arn=arn:aws:iam::${AWS_ACCOUNT_ID}:role/${KARPENTER_ROLE_NAME}" \
    --set "settings.clusterName=${CLUSTER_NAME}" \
    --set "settings.interruptionQueue=${CLUSTER_NAME}" \
    --set controller.resources.requests.cpu=1 \
    --set controller.resources.requests.memory=1Gi \
    --set controller.resources.limits.cpu=1 \
    --set controller.resources.limits.memory=1Gi \
    --wait

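# Optional sanity check (suggested addition, not in the original script): confirm the
# Karpenter controller came up in the namespace used by the helm install above. The label
# selector is the chart's usual app.kubernetes.io/name label.
kubectl get pods -n karpenter
kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=20
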
## KubeRay
# https://github.com/ray-project/kuberay/blob/master/helm-chart/ray-cluster/README.md
echo "Adding KubeRay Operator ..."
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm repo update

helm install kuberay-operator kuberay/kuberay-operator --namespace kuberay-system --create-namespace --version 1.2.2

# Register NodePools
# Get the role of the managed worker nodes so it can be associated with the Karpenter nodes
NODE_ROLE_PATTERN=eksctl-${CLUSTER_NAME}-nodegroup-ng-
NODE_ROLE_NAME=$(aws iam list-roles --query "Roles[?starts_with(RoleName, '${NODE_ROLE_PATTERN}')].RoleName" --output text | head -n 1)

export CLUSTER_NAME
export NODE_ROLE_NAME

# Patch each EC2NodeClass manifest with this cluster's subnet/security-group tags and node role
update_cluster_name_in_yaml() {
    local filename=$1
    echo "Changing $filename to update the cluster tag for your cluster $CLUSTER_NAME"
    yq eval 'select(.kind == "EC2NodeClass").spec.subnetSelectorTerms[].tags."eksctl.cluster.k8s.io/v1alpha1/cluster-name" = env(CLUSTER_NAME)' -i "$filename"
    yq eval 'select(.kind == "EC2NodeClass").spec.securityGroupSelectorTerms[].tags."eksctl.cluster.k8s.io/v1alpha1/cluster-name" = env(CLUSTER_NAME)' -i "$filename"
    yq eval 'select(.kind == "EC2NodeClass").spec.role = env(NODE_ROLE_NAME)' -i "$filename"
}

update_cluster_name_in_yaml "../karpenter-pools/karpenter-cpu.yaml"
update_cluster_name_in_yaml "../karpenter-pools/karpenter-cpu-inference.yaml"
update_cluster_name_in_yaml "../karpenter-pools/karpenter-cpu-inference-arm.yaml"

echo "Creating Karpenter NodePools..."
kubectl create -f ../karpenter-pools/karpenter-cpu.yaml
kubectl create -f ../karpenter-pools/karpenter-cpu-inference.yaml
kubectl create -f ../karpenter-pools/karpenter-cpu-inference-arm.yaml

kubectl config set-context --current --namespace=kuberay-system # set the default namespace to kuberay-system

echo "Ready to deploy your Ray Service...."
#######################################

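# Optional sanity check (suggested addition, not in the original script): verify the
# NodePools and EC2NodeClasses were accepted by Karpenter.
kubectl get nodepools,ec2nodeclasses
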
echo "Adding Grafana and Prometheus"
# Prometheus and Grafana
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add grafana https://grafana.github.io/helm-charts

helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace --wait

helm install grafana grafana/grafana --namespace monitoring --create-namespace --wait

# Print the Grafana admin password
kubectl get secret --namespace monitoring grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo

# Create the ServiceMonitor/PodMonitor resources for the Ray metrics endpoints
kubectl create -f prometheus-monitoring.yaml
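# Suggested follow-up (not part of the original script): reach Grafana locally with the
# admin password printed above; the grafana chart exposes its service on port 80 by default.
kubectl port-forward -n monitoring svc/grafana 3000:80
# then log in at http://localhost:3000 as "admin" and add the Ray / vLLM dashboards you need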
