
Commit 3c34163

feat(gke): add gemma tgi standard cluster (#805)

1 parent cf97fe1

File tree

1 file changed: +312 −0 lines
  • gke/standard/regional/gemma-tgi
/**
 * Copyright 2025 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

# [START gke_standard_regional_gemma_tgi]
data "google_project" "default" {
}

resource "google_container_cluster" "default" {
  name     = "gke-gemma-tgi"
  location = "us-central1"

  release_channel {
    channel = "RAPID"
  }
  initial_node_count = 1
  workload_identity_config {
    workload_pool = "${data.google_project.default.project_id}.svc.id.goog"
  }

  # Setting `deletion_protection` to `true` ensures that the cluster
  # cannot be accidentally deleted with Terraform.
  deletion_protection = false
}

resource "google_container_node_pool" "default" {
  name     = "gke-gemma-tgi"
  location = "us-central1"
  node_locations = [
    "us-central1-a",
  ]
  cluster = google_container_cluster.default.id

  initial_node_count = 1
  node_config {
    machine_type = "g2-standard-8"
    guest_accelerator {
      type  = "nvidia-l4"
      count = 1
      gpu_driver_installation_config {
        gpu_driver_version = "LATEST"
      }
    }
  }
}

data "google_client_config" "default" {}

provider "kubernetes" {
  host                   = "https://${google_container_cluster.default.endpoint}"
  token                  = data.google_client_config.default.access_token
  cluster_ca_certificate = base64decode(google_container_cluster.default.master_auth[0].cluster_ca_certificate)

  ignore_annotations = [
    "^cloud\\.google\\.com\\/neg"
  ]
}
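
# Not part of this commit: a minimal `required_providers` sketch for the
# three providers this sample exercises (google, kubernetes, and time, the
# latter for the `time_sleep` resource below). Pin versions as appropriate
# for your setup.
terraform {
  required_providers {
    google = {
      source = "hashicorp/google"
    }
    kubernetes = {
      source = "hashicorp/kubernetes"
    }
    time = {
      source = "hashicorp/time"
    }
  }
}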

resource "kubernetes_secret_v1" "default" {
  metadata {
    name = "secret-gemma-2b-tgi"
  }

  data = {
    "hf_api_token" = "HF_TOKEN" # Replace with a valid Hugging Face token
  }
}
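
# A hedged alternative (not part of this commit) to the hardcoded "HF_TOKEN"
# placeholder above: declare a sensitive input variable and reference it as
# var.hf_api_token in the secret's data map.
variable "hf_api_token" {
  type        = string
  sensitive   = true
  description = "Hugging Face API token with access to the Gemma model."
}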

resource "kubernetes_deployment_v1" "default" {
  metadata {
    name = "tgi-gemma-deployment"
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "gemma-server"
      }
    }

    template {
      metadata {
        labels = {
          "app"                        = "gemma-server"
          "ai.gke.io/model"            = "gemma-2-2b-it"
          "ai.gke.io/inference-server" = "text-generation-inference"
          "examples.ai.gke.io/source"  = "user-guide"
        }
      }

      spec {
        container {
          name  = "inference-server"
          image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311"

          resources {
            requests = {
              cpu                 = "2"
              memory              = "10Gi"
              "ephemeral-storage" = "10Gi"
              "nvidia.com/gpu"    = "1"
            }

            limits = {
              cpu                 = "2"
              memory              = "10Gi"
              "ephemeral-storage" = "10Gi"
              "nvidia.com/gpu"    = "1"
            }
          }

          env {
            name  = "AIP_HTTP_PORT"
            value = "8000"
          }
          env {
            name  = "NUM_SHARD"
            value = "1"
          }
          env {
            name  = "MAX_INPUT_LENGTH"
            value = "1562"
          }
          env {
            name  = "MAX_TOTAL_TOKENS"
            value = "2048"
          }
          env {
            name  = "MAX_BATCH_PREFILL_TOKENS"
            value = "2048"
          }
          env {
            name  = "CUDA_MEMORY_FRACTION"
            value = "0.93"
          }
          env {
            name  = "MODEL_ID"
            value = "google/gemma-2-2b-it"
          }
          env {
            name = "HUGGING_FACE_HUB_TOKEN"
            value_from {
              secret_key_ref {
                name = kubernetes_secret_v1.default.metadata[0].name
                key  = "hf_api_token"
              }
            }
          }

          volume_mount {
            name       = "dshm"
            mount_path = "/dev/shm"
          }
        }

        volume {
          name = "dshm"
          empty_dir {
            medium = "Memory"
          }
        }

        node_selector = {
          "cloud.google.com/gke-accelerator" = "nvidia-l4"
        }
      }
    }
  }
}

resource "kubernetes_service_v1" "default" {
  metadata {
    name = "llm-service"
  }

  spec {
    selector = {
      app = kubernetes_deployment_v1.default.spec[0].selector[0].match_labels.app
    }

    port {
      protocol    = "TCP"
      port        = 8000
      target_port = 8000
    }

    type = "ClusterIP"
  }

  depends_on = [time_sleep.wait_service_cleanup]
}

# Provide time for Service cleanup
resource "time_sleep" "wait_service_cleanup" {
  depends_on = [google_container_cluster.default]

  destroy_duration = "180s"
}
# [END gke_standard_regional_gemma_tgi]

# [START gke_standard_regional_gemma_tgi_gradio]
resource "kubernetes_deployment_v1" "gradio" {
222+
metadata {
223+
name = "gradio"
224+
labels = {
225+
"app" = "gradio"
226+
}
227+
}
228+
229+
spec {
230+
replicas = 1
231+
selector {
232+
match_labels = {
233+
app = "gradio"
234+
}
235+
}
236+
template {
237+
metadata {
238+
labels = {
239+
app = "gradio"
240+
}
241+
}
242+
spec {
243+
container {
244+
name = "gradio"
245+
image = "us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.4"
246+
resources {
247+
requests = {
248+
cpu = "250m"
249+
memory = "512Mi"
250+
}
251+
252+
limits = {
253+
cpu = "500m"
254+
memory = "512Mi"
255+
}
256+
}
257+
258+
env {
259+
name = "CONTEXT_PATH"
260+
value = "/generate"
261+
}
262+
env {
263+
name = "HOST"
264+
value = "http://llm-service:8000"
265+
}
266+
env {
267+
name = "LLM_ENGINE"
268+
value = "tgi"
269+
}
270+
env {
271+
name = "MODEL_ID"
272+
value = "gemma"
273+
}
274+
env {
275+
name = "USER_PROMPT"
276+
value = "<start_of_turn>user\\nprompt<end_of_turn>\\n"
277+
}
278+
env {
279+
name = "SYSTEM_PROMPT"
280+
value = "<start_of_turn>model\\nprompt<end_of_turn>\\n"
281+
}
282+
port {
283+
container_port = 7860
284+
}
285+
}
286+
}
287+
}
288+
}
289+
}

resource "kubernetes_service_v1" "gradio" {
  metadata {
    name = "gradio"
  }

  spec {
    selector = {
      app = kubernetes_deployment_v1.gradio.spec[0].selector[0].match_labels.app
    }

    port {
      protocol    = "TCP"
      port        = 8080
      target_port = 7860
    }

    type = "ClusterIP"
  }

  depends_on = [time_sleep.wait_service_cleanup]
}
# [END gke_standard_regional_gemma_tgi_gradio]
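
A short sketch, not part of this commit, of output values that surface the cluster attributes the kubernetes provider block above already reads (`name` and `endpoint` from the google provider's schema); these are handy when fetching credentials with `gcloud container clusters get-credentials` after `terraform apply`:

output "cluster_name" {
  value = google_container_cluster.default.name
}

output "cluster_endpoint" {
  value = google_container_cluster.default.endpoint
}

With credentials in place, both services are ClusterIP, so the TGI endpoint (llm-service on port 8000) and the Gradio UI (gradio on port 8080) are reachable via `kubectl port-forward`.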
