Skip to content

Commit 4291816

Browse files
authored
Add production-ready vLLM CoreWeave CKS terraform stack (#834)
Includes: - GPU Bare-Metal node orchestration - Secure Traefik ingress + TLS Endpoints (cert-manager) - Prometheus + Grafana monitoring - Built-in vLLM production stack + Vllm inference dashboards - Terraform + Helm integration Signed-off-by: Kosseila (CloudThrill) <klouddude@gmail.com>
1 parent 3396b0b commit 4291816

22 files changed

+6021
-0
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#################################################################
2+
# Default .gitignore content for all terraform-aws-modules below
3+
#################################################################
4+
5+
.DS_Store
6+
7+
# Local .terraform directories
8+
**/.terraform/*
9+
10+
# Terraform lockfile
11+
.terraform.lock.hcl
12+
13+
# .tfstate files
14+
*.tfstate
15+
*.tfstate.*
16+
*.tfplan
17+
18+
# Crash log files
19+
crash.log
20+
21+
# Exclude all .tfvars files, which are likely to contain sensitive data, such as
22+
# passwords, private keys, and other secrets. These should not be part of version
23+
# control as they are data points which are potentially sensitive and subject
24+
# to change depending on the environment.
25+
*.tfvars
26+
*env-vars
27+
# Ignore override files as they are usually used to override resources locally and so
28+
# are not checked in
29+
override.tf
30+
override.tf.json
31+
*_override.tf
32+
*_override.tf.json
33+
34+
# Ignore CLI configuration files
35+
.terraformrc
36+
terraform.rc

tutorials/terraform/coreweave/README.md

Lines changed: 680 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
###############################################################################
2+
# cluster-tools.tf
3+
# - Traefik (CoreWeave)
4+
# - cert-manager (CoreWeave) + manual ClusterIssuers (prod+staging, Traefik-pinned)
5+
# - metrics-server (upstream)
6+
# - kube-prometheus-stack (upstream) with Grafana ingress + dashboard sidecar
7+
###############################################################################
8+
9+
###############################
# 1) Metrics Server (Upstream)
###############################

# Upstream metrics-server chart in kube-system, created only when
# var.enable_metrics_server is true. Runs two replicas with a PDB so
# at most one replica is disrupted at a time.
resource "helm_release" "metrics_server" {
  for_each = var.enable_metrics_server ? toset(["metrics_server"]) : toset([])

  name       = "metrics-server"
  namespace  = "kube-system"
  repository = "https://kubernetes-sigs.github.io/metrics-server/"
  chart      = "metrics-server"
  version    = "3.12.0"

  values = [
    yamlencode({
      replicas = 2
      podDisruptionBudget = {
        enabled        = true
        maxUnavailable = 1
      }
    })
  ]

  # Don't block the apply on rollout completion.
  wait    = false
  timeout = 900

  depends_on = [coreweave_cks_cluster.k8s, terraform_data.wait_for_cpu_nodes]
}
31+
32+
#############################
# 2) cert-manager (CoreWeave) - issuers disabled (we manage them manually)
############################

# CoreWeave-packaged cert-manager chart. The chart's bundled issuers are
# turned off because the ClusterIssuers are declared manually below,
# allowing the ACME HTTP-01 solver to be pinned to Traefik.
resource "helm_release" "cert_manager" {
  for_each = var.enable_cert_manager ? toset(["cert-manager"]) : toset([])

  name             = "cert-manager"
  namespace        = "cert-manager"
  repository       = "https://charts.core-services.ingress.coreweave.com"
  chart            = "cert-manager"
  create_namespace = true

  set = [
    { name = "cert-issuers.enabled", value = "false" }
  ]

  wait    = true
  timeout = 600

  depends_on = [helm_release.traefik]
}
52+
53+
####################################
# 3) ClusterIssuer for Let's Encrypt
####################################

# Production Let's Encrypt ClusterIssuer, rendered from a manifest
# template with the ACME contact e-mail injected.
resource "kubectl_manifest" "letsencrypt_issuer-prod" {
  for_each = var.enable_cert_manager ? toset(["letsencrypt"]) : toset([])

  yaml_body = templatefile("${path.module}/config/manifests/letsencrypt-issuer-prod.yaml", {
    letsencrypt_email = var.letsencrypt_email
  })

  depends_on = [helm_release.cert_manager]
}
71+
72+
# Staging Let's Encrypt ClusterIssuer (higher rate limits, untrusted
# certs) for testing the issuance pipeline without burning prod quota.
resource "kubectl_manifest" "letsencrypt_issuer-staging" {
  for_each = var.enable_cert_manager ? toset(["letsencrypt"]) : toset([])

  yaml_body = templatefile("${path.module}/config/manifests/letsencrypt-issuer-stage.yaml", {
    letsencrypt_email = var.letsencrypt_email
  })

  depends_on = [helm_release.cert_manager]
}
86+
87+
##########################
# 4) Observability Stack
##########################

# kube-prometheus-stack (Prometheus + Grafana + Alertmanager) with a
# templated values file wiring up the Grafana ingress host and the
# selected Let's Encrypt ClusterIssuer.
resource "helm_release" "kube_prometheus_stack" {
  for_each = var.enable_monitoring ? toset(["kube_prometheus_stack"]) : toset([])

  name             = "kube-prometheus-stack"
  namespace        = "kube-prometheus-stack"
  repository       = "https://prometheus-community.github.io/helm-charts"
  chart            = "kube-prometheus-stack"
  version          = "75.15.0"
  create_namespace = true

  values = [templatefile("${path.module}/config/helm/kube-prome-stack.yaml", {
    grafana_admin_password = var.grafana_admin_password
    grafana_host           = "grafana.${var.org_id}-${var.cluster_name}.coreweave.app"
    issuer_name            = var.use_letsencrypt_staging ? "letsencrypt-staging" : "letsencrypt-prod"
    org_id                 = var.org_id
    cluster_name           = var.cluster_name
    prefix                 = var.grafana_host_prefix
  })]

  depends_on = [
    helm_release.traefik,
    kubectl_manifest.letsencrypt_issuer-prod,
    kubectl_manifest.letsencrypt_issuer-staging,
    terraform_data.wait_for_cpu_nodes # Ensure CPU nodes ready before deploying
  ]
}
121+
122+
#################################################################################
# 5) Wait until the latest CPU nodepool is ready - terraform_data with local-exec
################################################################################

resource "terraform_data" "wait_for_cpu_nodes" {
  count = var.enable_nodepool_cpu ? 1 : 0

  # Re-run the wait whenever the CPU nodepool manifest changes.
  input = sha1(kubectl_manifest.nodepool_cpu["cpu"].yaml_body)

  # Polls up to 240 x 10s (40 min) until at least var.cpu_node_target
  # nodes of the pool report STATUS == "Ready".
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-lc"]
    command = <<-EOT
      set -euo pipefail
      POOL="${var.cpu_nodepool_name}"
      TARGET="${var.cpu_node_target}"
      KUBECONFIG="${path.module}/kubeconfig"

      for _ in $(seq 1 240); do
        # '|| true' keeps a transient kubectl/API failure from aborting the
        # whole script under 'set -euo pipefail'; we retry instead.
        READY=$(kubectl --kubeconfig "$KUBECONFIG" get nodes \
          -l "compute.coreweave.com/node-pool=$POOL" \
          --no-headers 2>/dev/null \
          | awk '$2=="Ready"{c++} END{print c+0}' || true)

        [ "$${READY:-0}" -ge "$TARGET" ] && exit 0
        sleep 10
      done

      echo "Timed out waiting for $TARGET Ready node(s) in CPU pool $POOL" >&2
      exit 1
    EOT
  }

  depends_on = [
    kubectl_manifest.nodepool_cpu,
    null_resource.write_kubeconfig
  ]
}
157+
158+
################################################################################
# 6) GPU Nodepool with kubelet config (taints, scale down strategy, eviction)
################################################################################
# you can use this for more tolerant Ready match: awk '$2 ~ /Ready/ && $2 !~ /NotReady/ {c++} END{print c+0}'
resource "terraform_data" "wait_for_gpu_nodes" {
  count = var.enable_nodepool_gpu ? 1 : 0

  # Re-run the wait whenever the GPU nodepool manifest changes.
  input = sha1(kubectl_manifest.nodepool_gpu["gpu"].yaml_body)

  # Polls up to 240 x 10s (40 min) until at least var.gpu_node_target
  # nodes of the pool report STATUS == "Ready".
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-lc"]
    command = <<-EOT
      set -euo pipefail
      POOL="${var.gpu_nodepool_name}"
      TARGET="${var.gpu_node_target}"
      KUBECONFIG="${path.module}/kubeconfig"

      for _ in $(seq 1 240); do
        # '|| true' keeps a transient kubectl/API failure from aborting the
        # whole script under 'set -euo pipefail'; we retry instead.
        READY=$(kubectl --kubeconfig "$KUBECONFIG" get nodes \
          -l "compute.coreweave.com/node-pool=$POOL" \
          --no-headers 2>/dev/null \
          | awk '$2=="Ready"{c++} END{print c+0}' || true)

        [ "$${READY:-0}" -ge "$TARGET" ] && exit 0
        sleep 10
      done

      echo "Timed out waiting for $TARGET Ready node(s) in GPU pool $POOL" >&2
      exit 1
    EOT
  }

  depends_on = [
    kubectl_manifest.nodepool_gpu,
    null_resource.write_kubeconfig
  ]
}
193+
194+
195+
#########################################
# Cluster API DNS endpoint ready
#########################################

# Blocks until the API-server hostname from the freshly written kubeconfig
# resolves in DNS (up to 120 x 5s = 10 min), so later kubectl/Helm calls
# don't fail on a not-yet-propagated record.
resource "terraform_data" "wait_for_apiserver_dns" {
  depends_on = [null_resource.write_kubeconfig]

  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-lc"]
    command = <<-EOT
      set -euo pipefail
      KUBECONFIG="${path.module}/kubeconfig"

      # Extract server hostname from kubeconfig: drop the scheme, any
      # path suffix, and a trailing ":port" (getent needs a bare host;
      # previously "host:443" could never resolve).
      HOST="$(kubectl --kubeconfig "$KUBECONFIG" config view --raw -o jsonpath='{.clusters[0].cluster.server}' \
        | sed -E 's#^https?://##' | cut -d/ -f1 | sed -E 's/:[0-9]+$//')"

      echo "Waiting for API DNS: $HOST"

      for _ in $(seq 1 120); do
        if getent hosts "$HOST" >/dev/null 2>&1; then
          echo "DNS OK: $HOST"
          exit 0
        fi
        sleep 5
      done

      echo "API DNS did not resolve in time: $HOST" >&2
      exit 1
    EOT
  }
}

0 commit comments

Comments
 (0)