|
| 1 | +/** |
| 2 | +* Copyright 2025 Google LLC |
| 3 | +* |
| 4 | +* Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +* you may not use this file except in compliance with the License. |
| 6 | +* You may obtain a copy of the License at |
| 7 | +* |
| 8 | +* http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +* |
| 10 | +* Unless required by applicable law or agreed to in writing, software |
| 11 | +* distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +* See the License for the specific language governing permissions and |
| 14 | +* limitations under the License. |
| 15 | +*/ |
| 16 | + |
| 17 | +# [START gke_standard_regional_gemma_tgi] |
# Look up the current project so its project ID can be used to build the
# Workload Identity pool name below.
data "google_project" "default" {}
| 20 | + |
| 21 | +resource "google_container_cluster" "default" { |
| 22 | + name = "gke-gemma-tgi" |
| 23 | + location = "us-central1" |
| 24 | + |
| 25 | + release_channel { |
| 26 | + channel = "RAPID" |
| 27 | + } |
| 28 | + initial_node_count = 1 |
| 29 | + workload_identity_config { |
| 30 | + workload_pool = "${data.google_project.default.project_id}.svc.id.goog" |
| 31 | + } |
| 32 | + |
| 33 | + # Set `deletion_protection` to `true` will ensure that one cannot |
| 34 | + # accidentally delete this instance by use of Terraform. |
| 35 | + deletion_protection = false |
| 36 | +} |
| 37 | + |
| 38 | +resource "google_container_node_pool" "default" { |
| 39 | + name = "gke-gemma-tgi" |
| 40 | + location = "us-central1" |
| 41 | + node_locations = [ |
| 42 | + "us-central1-a", |
| 43 | + ] |
| 44 | + cluster = google_container_cluster.default.id |
| 45 | + |
| 46 | + initial_node_count = 1 |
| 47 | + node_config { |
| 48 | + machine_type = "g2-standard-8" |
| 49 | + guest_accelerator { |
| 50 | + type = "nvidia-l4" |
| 51 | + count = 1 |
| 52 | + gpu_driver_installation_config { |
| 53 | + gpu_driver_version = "LATEST" |
| 54 | + } |
| 55 | + } |
| 56 | + } |
| 57 | +} |
| 58 | + |
| 59 | +data "google_client_config" "default" {} |
| 60 | + |
| 61 | +provider "kubernetes" { |
| 62 | + host = "https://${google_container_cluster.default.endpoint}" |
| 63 | + token = data.google_client_config.default.access_token |
| 64 | + cluster_ca_certificate = base64decode(google_container_cluster.default.master_auth[0].cluster_ca_certificate) |
| 65 | + |
| 66 | + ignore_annotations = [ |
| 67 | + "^cloud\\.google\\.com\\/neg" |
| 68 | + ] |
| 69 | +} |
| 70 | + |
| 71 | +resource "kubernetes_secret_v1" "default" { |
| 72 | + metadata { |
| 73 | + name = "seceret-gemma-2b-tgi" |
| 74 | + } |
| 75 | + |
| 76 | + data = { |
| 77 | + "hf_api_token" = "HF_TOKEN" # Replace with valid Hugging Face Token |
| 78 | + } |
| 79 | +} |
| 80 | + |
| 81 | +resource "kubernetes_deployment_v1" "default" { |
| 82 | + metadata { |
| 83 | + name = "tgi-gemma-deployment" |
| 84 | + } |
| 85 | + |
| 86 | + spec { |
| 87 | + replicas = 1 |
| 88 | + |
| 89 | + selector { |
| 90 | + match_labels = { |
| 91 | + app = "gemma-server" |
| 92 | + } |
| 93 | + } |
| 94 | + |
| 95 | + template { |
| 96 | + metadata { |
| 97 | + labels = { |
| 98 | + "app" = "gemma-server" |
| 99 | + "ai.gke.io/model" = "gemma-2-2b-it" |
| 100 | + "ai.gke.io/inference-server" = "text-generation-inference" |
| 101 | + "examples.ai.gke.io/source" = "user-guide" |
| 102 | + } |
| 103 | + } |
| 104 | + |
| 105 | + spec { |
| 106 | + container { |
| 107 | + name = "inference-server" |
| 108 | + image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311" |
| 109 | + |
| 110 | + resources { |
| 111 | + requests = { |
| 112 | + cpu = "2" |
| 113 | + memory = "10Gi" |
| 114 | + "ephemeral-storage" = "10Gi" |
| 115 | + "nvidia.com/gpu" = "1" |
| 116 | + } |
| 117 | + |
| 118 | + limits = { |
| 119 | + cpu = "2" |
| 120 | + memory = "10Gi" |
| 121 | + "ephemeral-storage" = "10Gi" |
| 122 | + "nvidia.com/gpu" = "1" |
| 123 | + } |
| 124 | + } |
| 125 | + |
| 126 | + env { |
| 127 | + name = "AIP_HTTP_PORT" |
| 128 | + value = "8000" |
| 129 | + } |
| 130 | + env { |
| 131 | + name = "NUM_SHARD" |
| 132 | + value = "1" |
| 133 | + } |
| 134 | + env { |
| 135 | + name = "MAX_INPUT_LENGTH" |
| 136 | + value = "1562" |
| 137 | + } |
| 138 | + env { |
| 139 | + name = "MAX_TOTAL_TOKENS" |
| 140 | + value = "2048" |
| 141 | + } |
| 142 | + env { |
| 143 | + name = "MAX_BATCH_PREFILL_TOKENS" |
| 144 | + value = "2048" |
| 145 | + } |
| 146 | + env { |
| 147 | + name = "CUDA_MEMORY_FRACTION" |
| 148 | + value = "0.93" |
| 149 | + } |
| 150 | + env { |
| 151 | + name = "MODEL_ID" |
| 152 | + value = "google/gemma-2-2b-it" |
| 153 | + } |
| 154 | + env { |
| 155 | + name = "MODEL_ID" |
| 156 | + value = "google/gemma-2-2b-it" |
| 157 | + } |
| 158 | + env { |
| 159 | + name = "HUGGING_FACE_HUB_TOKEN" |
| 160 | + value_from { |
| 161 | + secret_key_ref { |
| 162 | + name = kubernetes_secret_v1.default.metadata[0].name |
| 163 | + key = "hf_api_token" |
| 164 | + } |
| 165 | + } |
| 166 | + } |
| 167 | + |
| 168 | + volume_mount { |
| 169 | + name = "dshm" |
| 170 | + mount_path = "/dev/shm" |
| 171 | + } |
| 172 | + |
| 173 | + } |
| 174 | + |
| 175 | + volume { |
| 176 | + name = "dshm" |
| 177 | + empty_dir { |
| 178 | + medium = "Memory" |
| 179 | + } |
| 180 | + } |
| 181 | + |
| 182 | + node_selector = { |
| 183 | + "cloud.google.com/gke-accelerator" = "nvidia-l4" |
| 184 | + } |
| 185 | + } |
| 186 | + } |
| 187 | + } |
| 188 | +} |
| 189 | + |
| 190 | +resource "kubernetes_service_v1" "default" { |
| 191 | + metadata { |
| 192 | + name = "llm-service" |
| 193 | + } |
| 194 | + |
| 195 | + spec { |
| 196 | + selector = { |
| 197 | + app = kubernetes_deployment_v1.default.spec[0].selector[0].match_labels.app |
| 198 | + } |
| 199 | + |
| 200 | + port { |
| 201 | + protocol = "TCP" |
| 202 | + port = 8000 |
| 203 | + target_port = 8000 |
| 204 | + } |
| 205 | + |
| 206 | + type = "ClusterIP" |
| 207 | + } |
| 208 | + |
| 209 | + depends_on = [time_sleep.wait_service_cleanup] |
| 210 | +} |
| 211 | + |
# Provide time for Service cleanup: on destroy, wait 180s after the Services
# are removed before deleting the cluster, so GKE can release load-balancer
# and NEG resources created on their behalf.
resource "time_sleep" "wait_service_cleanup" {
  depends_on = [google_container_cluster.default]

  destroy_duration = "180s"
}
| 218 | +# [END gke_standard_regional_gemma_tgi] |
| 219 | + |
| 220 | +# [START gke_standard_regional_gemma_tgi_gradio] |
| 221 | +resource "kubernetes_deployment_v1" "gradio" { |
| 222 | + metadata { |
| 223 | + name = "gradio" |
| 224 | + labels = { |
| 225 | + "app" = "gradio" |
| 226 | + } |
| 227 | + } |
| 228 | + |
| 229 | + spec { |
| 230 | + replicas = 1 |
| 231 | + selector { |
| 232 | + match_labels = { |
| 233 | + app = "gradio" |
| 234 | + } |
| 235 | + } |
| 236 | + template { |
| 237 | + metadata { |
| 238 | + labels = { |
| 239 | + app = "gradio" |
| 240 | + } |
| 241 | + } |
| 242 | + spec { |
| 243 | + container { |
| 244 | + name = "gradio" |
| 245 | + image = "us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.4" |
| 246 | + resources { |
| 247 | + requests = { |
| 248 | + cpu = "250m" |
| 249 | + memory = "512Mi" |
| 250 | + } |
| 251 | + |
| 252 | + limits = { |
| 253 | + cpu = "500m" |
| 254 | + memory = "512Mi" |
| 255 | + } |
| 256 | + } |
| 257 | + |
| 258 | + env { |
| 259 | + name = "CONTEXT_PATH" |
| 260 | + value = "/generate" |
| 261 | + } |
| 262 | + env { |
| 263 | + name = "HOST" |
| 264 | + value = "http://llm-service:8000" |
| 265 | + } |
| 266 | + env { |
| 267 | + name = "LLM_ENGINE" |
| 268 | + value = "tgi" |
| 269 | + } |
| 270 | + env { |
| 271 | + name = "MODEL_ID" |
| 272 | + value = "gemma" |
| 273 | + } |
| 274 | + env { |
| 275 | + name = "USER_PROMPT" |
| 276 | + value = "<start_of_turn>user\\nprompt<end_of_turn>\\n" |
| 277 | + } |
| 278 | + env { |
| 279 | + name = "SYSTEM_PROMPT" |
| 280 | + value = "<start_of_turn>model\\nprompt<end_of_turn>\\n" |
| 281 | + } |
| 282 | + port { |
| 283 | + container_port = 7860 |
| 284 | + } |
| 285 | + } |
| 286 | + } |
| 287 | + } |
| 288 | + } |
| 289 | +} |
| 290 | + |
| 291 | +resource "kubernetes_service_v1" "gradio" { |
| 292 | + metadata { |
| 293 | + name = "gradio" |
| 294 | + } |
| 295 | + |
| 296 | + spec { |
| 297 | + selector = { |
| 298 | + app = kubernetes_deployment_v1.gradio.spec[0].selector[0].match_labels.app |
| 299 | + } |
| 300 | + |
| 301 | + port { |
| 302 | + protocol = "TCP" |
| 303 | + port = 8080 |
| 304 | + target_port = 7860 |
| 305 | + } |
| 306 | + |
| 307 | + type = "ClusterIP" |
| 308 | + } |
| 309 | + |
| 310 | + depends_on = [time_sleep.wait_service_cleanup] |
| 311 | +} |
| 312 | +# [END gke_standard_regional_gemma_tgi_gradio] |
0 commit comments