6 changes: 6 additions & 0 deletions inference/trillium/vLLM/Qwen3-Coder-480B/Chart.yaml
apiVersion: v2
name: tpu7x-benchmark
description: tpu7x-benchmark
type: application
version: 0.1.0
appVersion: "1.16.0"
93 changes: 93 additions & 0 deletions inference/trillium/vLLM/Qwen3-Coder-480B/README.md

# Run vLLM Qwen3-Coder-480B-A35B-Instruct on tpu7x on GKE

This recipe covers running a vLLM inference workload on tpu7x on
GKE.

## Create the GKE Cluster
Create your tpu7x cluster using [XPK](https://github.com/AI-Hypercomputer/xpk).
The next sections assume you have created a cluster with tpu7x
nodes.

## Deploy vLLM Workload on GKE

### Configure kubectl to communicate with your cluster

```
gcloud container clusters get-credentials ${CLUSTER_NAME} --location=${LOCATION}
```

### Generate a new Hugging Face token if you don't already have one

On the Hugging Face website, create an account if necessary, then go to Your
Profile > Settings > Access Tokens.
Select Create new token, specify a name of your choice and a role with at least
Read permissions, then select Generate a token, follow the prompts, and save
your token.

NOTE: Also ensure that your account has access to the model on Hugging Face.
Some models (for example, Llama 3) require you to explicitly request access.
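Before installing, export the token so it can be passed to Helm in the next step. A minimal sketch (the token value shown is a placeholder, not a real token):

```shell
# Placeholder value; substitute your real Hugging Face token.
export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx

# Confirm it is set without printing the full value.
echo "HF_TOKEN starts with: ${HF_TOKEN:0:3}"
# prints: HF_TOKEN starts with: hf_
```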

### Run the benchmark
In this directory, run:

```
helm install ${RUN_NAME} . --set hf_token=${HF_TOKEN}
```

The benchmark will launch a client and server pod.
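For reference, the chart templates prefix every resource with the Helm release name, so a given `${RUN_NAME}` maps to resource names like the following (the release name here is only an example):

```shell
RUN_NAME=qwen3-bench   # example release name; pick any Helm release name

# Resource names derived from the release name in templates/benchmark.yaml:
echo "${RUN_NAME}-tpu-server"    # server Deployment (LeaderWorkerSet when multi-host)
echo "${RUN_NAME}-client"        # benchmark client pod
echo "${RUN_NAME}-vllm-service"  # LoadBalancer Service exposing port 8000
```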

On the server pod, at the end of the server startup you’ll see logs such as:

```
$ kubectl logs deployment/${RUN_NAME}-tpu-server -f

(APIServer pid=1) INFO: Started server process [1]
(APIServer pid=1) INFO: Waiting for application startup.
(APIServer pid=1) INFO: Application startup complete.
```

The client pod will wait until the server is up and then start the benchmark.
On the client pod, you'll see logs such as:

```
$ kubectl logs -f ${RUN_NAME}-client

============ Serving Benchmark Result ============
Successful requests: 10
Failed requests: 0
Benchmark duration (s): xx
Total input tokens: xxx
Total generated tokens: xxx
Request throughput (req/s): xx
Output token throughput (tok/s): xxx
Peak output token throughput (tok/s): xxx
Peak concurrent requests: 10.00
Total Token throughput (tok/s): xxx
---------------Time to First Token----------------
Mean TTFT (ms): xxx
Median TTFT (ms): xxx
P99 TTFT (ms): xxx
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): xxx
Median TPOT (ms): xxx
P99 TPOT (ms): xxx
---------------Inter-token Latency----------------
Mean ITL (ms): xxx
Median ITL (ms): xxx
P99 ITL (ms): xxx
==================================================
```

### Customizing the benchmark

To change benchmark parameters, such as the model or the number of prompts,
modify the settings in `values.yaml`.
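As a sketch, these are the keys the chart templates read from `values.yaml` (the key paths come from `templates/benchmark.yaml`; the example values are illustrative assumptions, not the chart's defaults):

```yaml
hf_token: ""                  # usually supplied via --set hf_token=${HF_TOKEN}
is_multi_host: false          # true switches the server to a LeaderWorkerSet
# gcp_service_account: "..."  # optional; enables the Workload Identity annotation
# k8s_service_account: "..."  # used when gcp_service_account is set
server:
  image: my-vllm-tpu-image    # placeholder image name
  topology: 2x4               # example gke-tpu-topology node selector value
  num_chips_per_node: 8       # example google.com/tpu request per pod
  num_nodes: 2                # multi-host only: LeaderWorkerSet size
  data_disk_size: 500Gi       # single-host only: Hyperdisk PVC size
  bash_command: "..."         # server startup command
  # worker_bash_command: "..."  # multi-host only: worker startup command
  # model_impl_type: "..."      # optional; sets the MODEL_IMPL_TYPE env var
client:
  image: my-benchmark-image   # placeholder image name
  bash_command: "..."         # benchmark client command
```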

### Cleanup
The server keeps running after the benchmark finishes. When you are done, clean
up by running:

```
helm uninstall ${RUN_NAME}
```
256 changes: 256 additions & 0 deletions inference/trillium/vLLM/Qwen3-Coder-480B/templates/benchmark.yaml
# yamllint disable
{{- if not .Values.is_multi_host }}
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: "{{ .Release.Name }}-hyperdisk-balanced-tpu"
provisioner: pd.csi.storage.gke.io
parameters:
  type: hyperdisk-balanced
  # provisioned-iops: "3000" # Optional: Adjust IOPS as needed
  # provisioned-throughput: "140" # Optional: Adjust Throughput (MiB/s) as needed
reclaimPolicy: Delete
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{ .Release.Name }}-hd-claim"
spec:
  storageClassName: "{{ .Release.Name }}-hyperdisk-balanced-tpu"
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{.Values.server.data_disk_size}}
---
{{- end }}
apiVersion: v1
kind: Secret
metadata:
  name: "{{ .Release.Name }}-hf-secret"
type: Opaque
stringData:
  hf_api_token: {{.Values.hf_token}}
---
{{- if .Values.gcp_service_account }}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ .Values.k8s_service_account | quote }}
  namespace: default
  annotations:
    iam.gke.io/gcp-service-account: {{ .Values.gcp_service_account }}
{{- end }}
---
{{- if not .Values.is_multi_host }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{ .Release.Name }}-tpu-server"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: "{{ .Release.Name }}-tpu-pod"
  template:
    metadata:
      labels:
        app: "{{ .Release.Name }}-tpu-pod"
    spec:
      {{- if .Values.gcp_service_account }}
      serviceAccountName: {{ .Values.k8s_service_account | quote }}
      {{- end }}
      nodeSelector:
        cloud.google.com/gke-tpu-accelerator: tpu7x
        cloud.google.com/gke-tpu-topology: {{.Values.server.topology}}
      containers:
        - name: vllm-tpu
          image: {{.Values.server.image}}
          command: ["/bin/bash", "-c"]
          args:
            - {{ .Values.server.bash_command | quote }}
          env:
            - name: HF_HOME
              value: /data
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: "{{ .Release.Name }}-hf-secret"
                  key: hf_api_token
            {{if .Values.server.model_impl_type}}
            - name: MODEL_IMPL_TYPE
              value: {{.Values.server.model_impl_type}}
            {{end}}
          ports:
            - containerPort: 8000
          resources:
            limits:
              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
            requests:
              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
          readinessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 15
            periodSeconds: 10
          volumeMounts:
            - mountPath: "/data"
              name: data-volume
            - mountPath: /dev/shm
              name: dshm
      volumes:
        - emptyDir:
            medium: Memory
          name: dshm
        - name: data-volume
          persistentVolumeClaim:
            claimName: "{{ .Release.Name }}-hd-claim"
{{- else }}
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: "{{ .Release.Name }}-tpu-server"
  annotations:
    leaderworkerset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
spec:
  replicas: 1
  leaderWorkerTemplate:
    size: {{ .Values.server.num_nodes }}
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
          app: "{{ .Release.Name }}-tpu-pod"
      spec:
        {{- if .Values.gcp_service_account }}
        serviceAccountName: {{ .Values.k8s_service_account | quote }}
        {{- end }}
        nodeSelector:
          cloud.google.com/gke-tpu-accelerator: tpu7x
          cloud.google.com/gke-tpu-topology: {{.Values.server.topology}}
        containers:
          - name: vllm-leader
            image: {{.Values.server.image}}
            env:
              - name: HUGGING_FACE_HUB_TOKEN
                valueFrom:
                  secretKeyRef:
                    name: "{{ .Release.Name }}-hf-secret"
                    key: hf_api_token
              - name: TPU_MULTIHOST_BACKEND
                value: ray
              - name: TPU_BACKEND_TYPE
                value: jax
              - name: JAX_PLATFORMS
                value: ""
              {{if .Values.server.model_impl_type}}
              - name: MODEL_IMPL_TYPE
                value: {{.Values.server.model_impl_type}}
              {{end}}
            command: ["/bin/bash", "-c"]
            args:
              - {{ .Values.server.bash_command | quote }}
            resources:
              limits:
                google.com/tpu: "{{.Values.server.num_chips_per_node}}"
              requests:
                google.com/tpu: "{{.Values.server.num_chips_per_node}}"
            ports:
              - containerPort: 8000
            readinessProbe:
              tcpSocket:
                port: 8000
              initialDelaySeconds: 15
              periodSeconds: 10
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - emptyDir:
              medium: Memory
            name: dshm
    workerTemplate:
      spec:
        {{- if .Values.gcp_service_account }}
        serviceAccountName: {{ .Values.k8s_service_account | quote }}
        {{- end }}
        nodeSelector:
          cloud.google.com/gke-tpu-accelerator: tpu7x
          cloud.google.com/gke-tpu-topology: {{.Values.server.topology}}
        containers:
          - name: vllm-worker
            image: {{.Values.server.image}}
            command:
              - sh
              - -c
              - {{ .Values.server.worker_bash_command | quote }}
            resources:
              limits:
                google.com/tpu: "{{.Values.server.num_chips_per_node}}"
              requests:
                google.com/tpu: "{{.Values.server.num_chips_per_node}}"
            env:
              - name: HUGGING_FACE_HUB_TOKEN
                valueFrom:
                  secretKeyRef:
                    name: "{{ .Release.Name }}-hf-secret"
                    key: hf_api_token
              - name: TPU_MULTIHOST_BACKEND
                value: ray
              - name: TPU_BACKEND_TYPE
                value: jax
              - name: JAX_PLATFORMS
                value: ""
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - emptyDir:
              medium: Memory
            name: dshm

{{- end }}
---
apiVersion: v1
kind: Service
metadata:
  name: "{{ .Release.Name }}-vllm-service"
spec:
  selector:
    app: "{{ .Release.Name }}-tpu-pod"
    {{- if .Values.is_multi_host }}
    role: leader
    {{- end }}
  type: LoadBalancer
  ports:
    - name: http
      protocol: TCP
      port: 8000
      targetPort: 8000
---
apiVersion: v1
kind: Pod
metadata:
  name: "{{ .Release.Name }}-client"
spec:
  {{- if .Values.gcp_service_account }}
  serviceAccountName: {{ .Values.k8s_service_account | quote }}
  {{- end }}
  terminationGracePeriodSeconds: 60
  containers:
    - name: vllm-bench
      image: {{.Values.client.image}}
      command: ["/bin/bash", "-c"]
      args: ["{{.Values.client.bash_command}}"]
      env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: "{{ .Release.Name }}-hf-secret"
        - name: SERVER_HOSTNAME
          value: "{{ .Release.Name }}-vllm-service.default.svc.cluster.local"
  restartPolicy: Never