From 556095740f5e1aa8369c98a99d27a599858d2c05 Mon Sep 17 00:00:00 2001 From: lepan-google Date: Wed, 3 Dec 2025 07:35:13 +0000 Subject: [PATCH 01/10] [A4X TensorRT Inference Benchmark] A4X DeepSeek R1 NVFP4 on TensorRT with GCSFuse storage This change added deployment configs and instructions for A4X DeepSeek R1 NVFP4 on TensorRT with GCSFuse storage. I followed [previous training storage recipe PR](https://github.com/AI-Hypercomputer/gpu-recipes/pull/37) and modified based on existing [CMCS recipe with HuggingFace](https://github.com/AI-Hypercomputer/gpu-recipes/pull/50) TESTED=unit tests --- .../tensorrt-llm-gcs/README.md | 409 ++++++++++++++++++ .../tensorrt-llm-gcs/values.yaml | 78 ++++ .../deployment/Chart.yaml | 20 + .../templates/serving-config-configmap.yaml | 25 ++ .../templates/serving-launcher-configmap.yaml | 27 ++ .../templates/serving-launcher.yaml | 267 ++++++++++++ .../deployment/templates/serving-svc.yaml | 26 ++ .../storage/gcs-fuse/templates/pv.yaml | 39 +- .../storage/gcs-fuse/templates/pvc.yaml | 4 +- src/helm-charts/storage/gcs-fuse/values.yaml | 5 +- src/launchers/trtllm-launcher-gcs.sh | 211 +++++++++ 11 files changed, 1100 insertions(+), 11 deletions(-) create mode 100644 inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md create mode 100644 inference/a4x/single-host-serving/tensorrt-llm-gcs/values.yaml create mode 100644 src/helm-charts/a4x/inference-templates-gcs/deployment/Chart.yaml create mode 100644 src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-config-configmap.yaml create mode 100644 src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-launcher-configmap.yaml create mode 100644 src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-launcher.yaml create mode 100644 src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-svc.yaml create mode 100644 src/launchers/trtllm-launcher-gcs.sh diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md new file mode 100644 index 0000000..97ff76e --- /dev/null +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md @@ -0,0 +1,409 @@ +# Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) on A4x GKE Node Pool using Google Cloud Storage + +This document outlines the steps to serve and benchmark various Large Language Models (LLMs) using the [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) framework on a single [A4x GKE Node pool](https://cloud.google.com/kubernetes-engine) +and using Google Cloud Storage to hold the model folder. + +This guide walks you through setting up the necessary cloud infrastructure, configuring your environment, and deploying a high-performance LLM for inference. + + +## Table of Contents + +- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) on A4x GKE Node Pool using Google Cloud Storage](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-on-a4x-gke-node-pool-using-google-cloud-storage) + - [Table of Contents](#table-of-contents) + - [1. Test Environment](#1-test-environment) + - [2. High-Level Flow](#2-high-level-flow) + - [3. Environment Setup (One-Time)](#3-environment-setup-one-time) + - [3.1. Clone the Repository](#31-clone-the-repository) + - [3.2. Configure Environment Variables](#32-configure-environment-variables) + - [3.3. 
Connect to your GKE Cluster](#33-connect-to-your-gke-cluster) + - [3.4 Upload the model checkpoints](#34-upload-the-model-checkpoints) + - [3.5 Create Persistent Volumes and Persistent Volume Claims](#35-create-persistent-volumes-and-persistent-volume-claims) + - [3.6 Grant Storage Permission to Kubernetes Service Account](#36-grant-storage-permission-to-kubernetes-service-account) + - [4. Run the recipe](#4-run-the-recipe) + - [4.1. Inference benchmark for DeepSeek-R1 671B Model](#41-inference-benchmark-for-deepseek-r1-671b-model) + - [5. Monitoring and Troubleshooting](#5-monitoring-and-troubleshooting) + - [5.1. Check Deployment Status](#51-check-deployment-status) + - [5.2. View Logs](#52-view-logs) + - [6. Cleanup](#6-cleanup) + + +## 1. Test Environment + +[Back to Top](#table-of-contents) + +The recipe uses the following setup: + +* **Orchestration**: [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +* **Deployment Configuration**: A [Helm chart](https://helm.sh/) is used to configure and deploy a [Kubernetes Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). This deployment encapsulates the inference of the target LLM using the TensorRT-LLM framework. + Another Helm chart is used to create persistent volume and persistent volume claim for model folder in GCS. + +This recipe has been optimized for and tested with the following configuration: + +* **GKE Cluster**: + * A [regional standard cluster](https://cloud.google.com/kubernetes-engine/docs/concepts/configuration-overview) version: `1.33.4-gke.1036000` or later. + * A GPU node pool with 1 [a4x-highgpu-4g](https://cloud.google.com/compute/docs/gpus) machine. + * [Workload Identity Federation for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) enabled. + * [Cloud Storage FUSE CSI driver for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/cloud-storage-fuse-csi-driver) enabled. + * [DCGM metrics](https://cloud.google.com/kubernetes-engine/docs/how-to/dcgm-metrics) enabled. + * [Kueue](https://kueue.sigs.k8s.io/docs/reference/kueue.v1beta1/) and [JobSet](https://jobset.sigs.k8s.io/docs/overview/) APIs installed. + * Kueue configured to support [Topology Aware Scheduling](https://kueue.sigs.k8s.io/docs/concepts/topology_aware_scheduling/). +* A regional Google Cloud Storage (GCS) bucket to store logs generated by the recipe runs. +* A regional Google Cloud Storage (GCS) bucket to store model folder. + +> [!IMPORTANT] +> - To prepare the required environment, see the [GKE environment setup guide](../../../../docs/configuring-environment-gke-a4x.md). +> Provisioning a new GKE cluster is a long-running operation and can take **20-30 minutes**. +> - All GCS buckets must be in the same region as the GKE cluster. + + +## 2. 
High-Level Flow + +[Back to Top](#table-of-contents) + +Here is a simplified diagram of the flow that we follow in this recipe: + +```mermaid +--- +config: + layout: dagre +--- +flowchart TD + subgraph workstation["Client Workstation"] + T["Cluster Toolkit"] + B("Kubernetes API") + A["helm install"] + end + subgraph gke["GKE Cluster (A4x)"] + C["Deployment"] + D["Pod"] + E["TensorRT-LLM container"] + F["Service"] + end + subgraph storage["Cloud Storage"] + J["Model Bucket"] + H["Logs Bucket"] + end + + %% Logical/actual flow + T -- Create Cluster --> gke + A --> B + B --> C & F + C --> D + D --> E + F --> C + H -- Downloads at runtime --> E + E -- Write logs --> J + + + %% Layout control + gke +``` + +* **helm:** A package manager for Kubernetes to define, install, and upgrade applications. It's used here to configure and deploy the Kubernetes Deployment. +* **Deployment:** Manages the lifecycle of your model server pod, ensuring it stays running. +* **Service:** Provides a stable network endpoint (a DNS name and IP address) to access your model server. +* **Pod:** The smallest deployable unit in Kubernetes. The Triton server container with TensorRT-LLM runs inside this pod on a GPU-enabled node. +* **Cloud Storage:** Cloud Storage buckets to store model folder, benchmark logs and other artifacts. + + +## 3. Environment Setup (One-Time) + +[Back to Top](#table-of-contents) + +First, you'll configure your local environment. These steps are required once before you can deploy any models. + + +### 3.1. Clone the Repository + +```bash +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=$(pwd) +export RECIPE_ROOT=$REPO_ROOT/inference/a4x/single-host-serving/tensorrt-llm-gcs +``` + + +### 3.2. Configure Environment Variables + +This is the most critical step. These variables are used in subsequent commands to target the correct resources. + +```bash +export PROJECT_ID= +export CLUSTER_REGION= +export CLUSTER_NAME= +export KUEUE_NAME= +export GCS_BUCKET_LOGS= +export GCS_BUCKET_SERVING_MODEL= +export GCS_FOLDER_SERVING_MODEL= +export TRTLLM_VERSION=1.2.0rc2 + +export PROJECT_ID=supercomputer-testing +export CLUSTER_REGION=us-west8 +export CLUSTER_NAME=imo-glacier-peak +export KUEUE_NAME=a4x +export GCS_BUCKET_LOGS=tess-benchmark-outputs +export GCS_BUCKET_SERVING_MODEL=serving-model-us-west8 +export GCS_FOLDER_SERVING_MODEL=cp-hf +export TRTLLM_VERSION=1.2.0rc2 + +# Set the project for gcloud commands +gcloud config set project $PROJECT_ID +``` + +Replace the following values: + +| Variable | Description | Example | +| --------------------- | ------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | +| `PROJECT_ID` | Your Google Cloud Project ID. | `gcp-project-12345` | +| `CLUSTER_REGION` | The GCP region where your GKE cluster is located. | `us-central1` | +| `CLUSTER_NAME` | The name of your GKE cluster. | `a4x-cluster` | +| `KUEUE_NAME` | The name of the Kueue local queue. The default queue created by the cluster toolkit is `a4x`. Verify the name in your cluster. | `a4x` | +| `GCS_BUCKET_LOGS` | Name of your GCS logs bucket (do not include `gs://`). | `my-benchmark-logs-bucket` | +| `GCS_BUCKET_SERVING_MODEL` | Name of your GCS model bucket (do not include `gs://`). | `my-benchmark-model-bucket` | +| `GCS_FOLDER_SERVING_MODEL` | Name of your GCS model folder (do not include `gs://{your-model-bucket}`). 
| `my-benchmark-model-bucket` | +| `TRTLLM_VERSION` | The tag/version for the Docker image. Other verions can be found at https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release | `1.2.0rc2` | + + + +### 3.3. Connect to your GKE Cluster + +Fetch credentials for `kubectl` to communicate with your cluster. + +```bash +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + + + +### 3.4 Upload the model checkpoints +In this recipe, we are using [DeepSeek-R1 671B model](https://huggingface.co/deepseek-ai/DeepSeek-R1) on HuggingFace. +To download the model: +- [mount the bucket](https://docs.cloud.google.com/storage/docs/cloud-storage-fuse/mount-bucket) +to your local system. +- Access into the mount path, create a folder to hold model. +- [Download](https://huggingface.co/docs/hub/en/models-downloading) +the model through `hf command` + + +### 3.5 Create Persistent Volumes and Persistent Volume Claims + +The inference deployment accesses GCS buckets for model through +[the Cloud Storage FUSE CSI driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver) +configured using Kubernetes Persistent Volumes (PV) and Persistent Volume +Claims (PVC). You must generate PVs and PVCs for serving bucket using the +[gcs-fuse helper Helm chart](../../../../src/helm-charts/storage/gcs-fuse). +The chart configures the FUSE driver settings following the best practices +for optimizing access to buckets for training data and checkpoints. + +``` +helm install -f $REPO_ROOT/src/helm-charts/storage/gcs-fuse/values.yaml \ +--set gcsVolumes[2].bucketName=${GCS_BUCKET_SERVING_MODEL} \ +--set gcsVolumes[2].dirPath=${GCS_FOLDER_SERVING_MODEL} \ +$USER-gcs-pv-pvc \ +$REPO_ROOT/src/helm-charts/storage/gcs-fuse +``` + + +### 3.6 Grant Storage Permission to Kubernetes Service Account + +For a cluster with +[Workload Identity Federation](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) +, you can grant `roles/storage.objectAdmin` access to Kubernetes service +account following +[instruction](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity#kubernetes-resources-iam-policies). + + + +## 4. Run the recipe + +[Back to Top](#table-of-contents) + +This recipe supports the deployment of the following models: + +1. [DeepSeek-R1-NVFP4-v2](#serving-deepseek-r1) + +> [!NOTE] +> After running the recipe with `helm install`, it can take **up to 30 minutes** for the deployment to become fully available. This is because the GKE node must first pull the Docker image. + + +### 4.1. Inference benchmark for DeepSeek-R1 671B Model + +[Back to Top](#table-of-contents) + +The recipe runs inference throughput benchmark for [DeepSeek-R1 671B NVFP4 model](https://huggingface.co/nvidia/DeepSeek-R1-NVFP4-v2) which is Nvidia's pre-quantized FP4 checkpoint of the original [DeepSeek-R1 671B model](https://huggingface.co/deepseek-ai/DeepSeek-R1). + +The recipe uses [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-benchmarking.md), a command-line tool from NVIDIA to benchmark the performance of TensorRT-LLM engine. For more information about `trtllm-bench`, see the [TensorRT-LLM documentation](https://github.com/NVIDIA/TensorRT-LLM). 
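For orientation, the launcher used by this recipe ([`src/launchers/trtllm-launcher-gcs.sh`](../../../../src/launchers/trtllm-launcher-gcs.sh)) first generates a synthetic dataset and then calls `trtllm-bench` roughly as sketched below. The sequence lengths, request count, parallelism sizes, and file paths shown here are illustrative; the actual values are derived from the Helm chart settings and the serving config file described in the following sections.

```bash
# Illustrative sketch only: the launcher derives these values at runtime.
# Paths assume the GCS model mount at /serving-model and local SSD scratch at /ssd.
python3 $TRTLLM_DIR/benchmarks/cpp/prepare_dataset.py \
  --tokenizer /serving-model \
  --stdout token-norm-dist \
  --num-requests=1000 \
  --input-mean=128 --output-mean=128 \
  --input-stdev=0 --output-stdev=0 > /ssd/dataset.json

trtllm-bench \
  --model nvidia/DeepSeek-R1-NVFP4-v2 \
  --model_path /serving-model throughput \
  --dataset /ssd/dataset.json \
  --tp 4 --pp 1 --ep 4 \
  --backend pytorch \
  --kv_cache_free_gpu_mem_fraction 0.95 \
  --extra_llm_api_options /tmp/extra_llm_api_args.yaml
```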
+ +> [!NOTE] +> The config file directly exposes the settings within TensorRT-LLM's llm_args.py class, which are passed to `trtllm-bench`, you can modify these as needed in [`src/frameworks/a4x/trtllm-configs/deepseek-r1-nvfp4.yaml`](../../../../src/frameworks/a4x/trtllm-configs/deepseek-r1-nvfp4.yaml) + +1. Install the helm chart to prepare and benchmark the model using [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-benchmarking.md) tool: + + ```bash + --set queue=${KUEUE_NAME} \ + + cd $RECIPE_ROOT + helm install -f values.yaml \ + --set-file workload_launcher=$REPO_ROOT/src/launchers/trtllm-launcher-gcs.sh \ + --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/trtllm-configs/deepseek-r1-nvfp4.yaml \ + --set "volumes.gcsMounts[0].bucketName=${GCS_BUCKET_LOGS}" \ + --set workload.model.name=nvidia/DeepSeek-R1-NVFP4-v2 \ + --set workload.image=nvcr.io/nvidia/tensorrt-llm/release:${TRTLLM_VERSION} \ + $USER-serving-deepseek-r1-model-39 \ + $REPO_ROOT/src/helm-charts/a4x/inference-templates-gcs/deployment + ``` + + This creates a Helm release and a Deployment named `$USER-serving-deepseek-r1-model`, and a Service named `$USER-serving-deepseek-r1-model-svc`. + +2. **Check the deployment status.** + + ```bash + kubectl get deployment/$USER-serving-deepseek-r1-model + ``` + + Wait until the `READY` column shows `1/1`. See the [Monitoring and Troubleshooting](#monitoring) section to view the deployment logs. + + > [!NOTE] + > - This helm chart is configured to run only a single benchmarking experiment for 1k requests for 128 tokens of input/output lengths. To run other experiments, you can add the various combinations provided in the [values.yaml](values.yaml) file. + + + +## 5. Monitoring and Troubleshooting + +[Back to Top](#table-of-contents) + +After the model is deployed via Helm as described in the sections [above](#run-the-recipe), use the following steps to monitor the deployment and interact with the model. Replace `` and `` with the appropriate names from the model-specific deployment instructions (e.g., `$USER-serving-deepseek-r1-model` and `$USER-serving-deepseek-r1-model-svc`). + + + +### 5.1. Check Deployment Status + +Check the status of your deployment. Replace the name if you deployed a different model. + +```bash +# Example for DeepSeek-R1 671B +kubectl get deployment/$USER-serving-deepseek-r1-model +``` + +Wait until the `READY` column shows `1/1`. If it shows `0/1`, the pod is still starting up. + +> [!NOTE] +> In the GKE UI on Cloud Console, you might see a status of "Does not have minimum availability" during startup. This is normal and will resolve once the pod is ready. + + +### 5.2. 
View Logs + +To see the logs from the TRTLLM server (useful for debugging), use the `-f` flag to follow the log stream: + +```bash +kubectl logs -f deployment/$USER-serving-deepseek-r1-model +``` + +You should see logs indicating preparing the model, and then running the throughput benchmark test, similar to this: + +```bash +Running benchmark for nvidia/DeepSeek-R1-NVFP4-v2 with ISL=128, OSL=128, TP=4, EP=4, PP=1 + +=========================================================== += PYTORCH BACKEND +=========================================================== +Model: nvidia/DeepSeek-R1-NVFP4-v2 +Model Path: /ssd/nvidia/DeepSeek-R1-NVFP4-v2 +TensorRT LLM Version: 1.2 +Dtype: bfloat16 +KV Cache Dtype: FP8 +Quantization: NVFP4 + +=========================================================== += REQUEST DETAILS +=========================================================== +Number of requests: 1000 +Number of concurrent requests: 985.9849 +Average Input Length (tokens): 128.0000 +Average Output Length (tokens): 128.0000 +=========================================================== += WORLD + RUNTIME INFORMATION +=========================================================== +TP Size: 4 +PP Size: 1 +EP Size: 4 +Max Runtime Batch Size: 2304 +Max Runtime Tokens: 4608 +Scheduling Policy: GUARANTEED_NO_EVICT +KV Memory Percentage: 85.00% +Issue Rate (req/sec): 8.3913E+13 + +=========================================================== += PERFORMANCE OVERVIEW +=========================================================== +Request Throughput (req/sec): X.XX +Total Output Throughput (tokens/sec): X.XX +Total Token Throughput (tokens/sec): X.XX +Total Latency (ms): X.XX +Average request latency (ms): X.XX +Per User Output Throughput [w/ ctx] (tps/user): X.XX +Per GPU Output Throughput (tps/gpu): X.XX + +-- Request Latency Breakdown (ms) ----------------------- + +[Latency] P50 : X.XX +[Latency] P90 : X.XX +[Latency] P95 : X.XX +[Latency] P99 : X.XX +[Latency] MINIMUM: X.XX +[Latency] MAXIMUM: X.XX +[Latency] AVERAGE: X.XX + +=========================================================== += DATASET DETAILS +=========================================================== +Dataset Path: /ssd/token-norm-dist_DeepSeek-R1-NVFP4-v2_128_128_tp4.json +Number of Sequences: 1000 + +-- Percentiles statistics --------------------------------- + + Input Output Seq. Length +----------------------------------------------------------- +MIN: 128.0000 128.0000 256.0000 +MAX: 128.0000 128.0000 256.0000 +AVG: 128.0000 128.0000 256.0000 +P50: 128.0000 128.0000 256.0000 +P90: 128.0000 128.0000 256.0000 +P95: 128.0000 128.0000 256.0000 +P99: 128.0000 128.0000 256.0000 +=========================================================== +``` + + +## 6. Cleanup + +To avoid incurring further charges, clean up the resources you created. + +1. **Uninstall the Helm Release:** + + First, list your releases to get the deployed models: + + ```bash + # list deployed models + helm list --filter $USER-serving- + ``` + + Then, uninstall the desired release: + + ```bash + # uninstall the deployed model + helm uninstall + ``` + Replace `` with the helm release names listed. + +2. **Delete the Persistent Volume and Persistent Volume Claim:** + + ```bash + # uninstall the deployed pv and pvc. + helm uninstall $USER-gcs-pv-pvc + ``` + +3. (Optional) Delete the built Docker image from Artifact Registry if no longer needed. +4. (Optional) Delete Cloud Build logs. +5. (Optional) Clean up files in your GCS bucket if benchmarking was performed. +6. 
(Optional) Delete the [test environment](#test-environment) provisioned including GKE cluster. diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/values.yaml b/inference/a4x/single-host-serving/tensorrt-llm-gcs/values.yaml new file mode 100644 index 0000000..47d0d59 --- /dev/null +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/values.yaml @@ -0,0 +1,78 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +queue: + +dwsSettings: + maxRunDurationSeconds: + +volumes: + gcsVolumes: true + ssdMountPath: "/ssd" + gcsMounts: + - bucketName: + mountPath: "/gcs-logs" + pvcMounts: + - claimName: "gcs-serving-model-pvc" + mountPath: "/serving-model" + +service: + type: ClusterIP + ports: + http: 8000 + +workload: + model: + name: + gpus: 4 + image: + framework: trtllm + configFile: serving-args.yaml + configPath: /workload/configs + envs: + - name: LAUNCHER_SCRIPT + value: "/workload/launcher/launch-workload.sh" + - name: SERVER_ARGS_FILE + value: "/workload/configs/serving-args.yaml" + benchmarks: + experiments: + - isl: 128 + osl: 128 + num_requests: 1000 + +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.7 + ncclSettings: + - name: NCCL_DEBUG + value: "VERSION" + +targetNodes: +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-0phl +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-12dg +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-4ncf +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-6t1h +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-7knw +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-8z5k +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-jt1t +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-km6k +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-lr4w +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-lzj8 +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-nghr +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-q1rz +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-rvlb +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-sjcg +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-v549 +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-vd4h +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-x1qx +- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-z47k diff --git a/src/helm-charts/a4x/inference-templates-gcs/deployment/Chart.yaml b/src/helm-charts/a4x/inference-templates-gcs/deployment/Chart.yaml new file mode 100644 index 0000000..4f584cc --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-gcs/deployment/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: single-host-serving-deployment-template +description: single-host-serving-deployment-template +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-config-configmap.yaml b/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-config-configmap.yaml new file mode 100644 index 0000000..a17bdf4 --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-config-configmap.yaml @@ -0,0 +1,25 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + serving-configuration: |- +{{- if .Values.serving_config }} +{{ .Values.serving_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} \ No newline at end of file diff --git a/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-launcher-configmap.yaml b/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-launcher-configmap.yaml new file mode 100644 index 0000000..b111553 --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-launcher-configmap.yaml @@ -0,0 +1,27 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} \ No newline at end of file diff --git a/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-launcher.yaml b/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-launcher.yaml new file mode 100644 index 0000000..c64a237 --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-launcher.yaml @@ -0,0 +1,267 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{ $nodes := div .Values.workload.gpus 4 | max 1 }} +{{ $gpusPerNode := min .Values.workload.gpus 4 }} + +{{ $root := . }} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + app: {{ .Release.Name }}-serving + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + replicas: {{ $nodes }} + selector: + matchLabels: + app: {{ .Release.Name }}-serving + template: + metadata: + labels: + app: {{ .Release.Name }}-serving + annotations: + kubectl.kubernetes.io/default-container: serving + {{- if or $root.Values.volumes.gcsVolumes $root.Values.volumes.pvcVolumes}} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Always + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + {{ range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{ end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + - key: "kubernetes.io/arch" + operator: "Equal" + value: "arm64" + effect: "NoSchedule" + volumes: + {{- if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{- end }} + - name: serving-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: serving-configuration + path: {{ $root.Values.workload.configFile | default "serving-args" }} + - name: serving-launcher + configMap: + name: "{{.Release.Name}}-launcher" + defaultMode: 0700 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ $gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end }} + {{ if (gt (len $root.Values.volumes.pvcMounts) 0) }} + - name: "{{ (index $root.Values.volumes.pvcMounts 0).claimName }}" + persistentVolumeClaim: + claimName: "{{ (index $root.Values.volumes.pvcMounts 0).claimName }}" + {{- end }} + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{- if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: 
Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{- end }} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + - name: serving + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + {{- if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 12 }} + {{- end }} + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + - name: LD_LIBRARY_PATH + value: /usr/local/gib/lib64:/usr/local/nvidia/lib64 + {{- if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{- end }} + # Workload specific environment variables + - name: MODEL_NAME + value: "{{ $root.Values.workload.model.name }}" + - name: TRTLLM_DIR + value: "/app/tensorrt_llm" + {{- if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 12 }} + {{- end }} + + workingDir: /workload + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [ ! -f "$LAUNCHER_SCRIPT" ]; then + echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" + exit 1 + fi + + ARGS=() + EXTRA_ARGS_FILE="/tmp/extra_llm_api_args.yaml" + + # Use Python to parse the main config file, extract llm_api_args, + # and generate the command-line arguments. + python -c " + import yaml + import sys + + args = [] + llm_api_args = {} + config_file = sys.argv[1] + extra_args_file = sys.argv[2] + + try: + with open(config_file, 'r') as f: + config = yaml.safe_load(f) + + if 'llm_api_args' in config: + llm_api_args = config.pop('llm_api_args') + with open(extra_args_file, 'w') as f: + yaml.dump(llm_api_args, f) + + for key, value in config.items(): + if value is True: + args.append(f'--{key}') + elif value is not False: + args.append(f'--{key}') + args.append(str(value)) + + # Print the arguments for the shell script to capture + print(' '.join(args)) + + except Exception as e: + print(f'Error parsing config file: {e}', file=sys.stderr) + sys.exit(1) + " "$SERVER_ARGS_FILE" "$EXTRA_ARGS_FILE" > /tmp/launcher_args.txt + + # Read the generated arguments into the ARGS array + mapfile -t ARGS < <(tr ' ' '\n' < /tmp/launcher_args.txt) + rm /tmp/launcher_args.txt + + {{ if eq $root.Values.workload.framework "trtllm" }} + {{- range $root.Values.workload.benchmarks.experiments }} + echo "Running: $LAUNCHER_SCRIPT --model_name $MODEL_NAME --isl {{ .isl }} --osl {{ .osl }} --num_requests {{ .num_requests }} -- ${ARGS[@]}" + exec "$LAUNCHER_SCRIPT" --model_name $MODEL_NAME --isl {{ .isl }} --osl {{ .osl }} --num_requests {{ .num_requests }} -- "${ARGS[@]}" + {{- end }} + {{ else }} + echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" + exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" + {{- end }} + + volumeMounts: + {{- if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{- end }} + - name: serving-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + - name: serving-launcher + mountPath: /workload/launcher + - name: shared-memory + mountPath: /dev/shm + {{- range 
$gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + {{ if (gt (len $root.Values.volumes.pvcMounts) 0) }} + - name: "{{ (index $root.Values.volumes.pvcMounts 0).claimName }}" + mountPath: "{{ (index $root.Values.volumes.pvcMounts 0).mountPath }}" + {{- end }} + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + requests: + nvidia.com/gpu: {{ $gpusPerNode }} + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-svc.yaml b/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-svc.yaml new file mode 100644 index 0000000..3d1363b --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-svc.yaml @@ -0,0 +1,26 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + name: {{ .Release.Name }}-svc +spec: + selector: + app: {{ .Release.Name }}-serving + ports: + - name: http + port: {{ .Values.service.ports.http }} + targetPort: {{ .Values.service.ports.http }} + type: {{ .Values.service.type }} \ No newline at end of file diff --git a/src/helm-charts/storage/gcs-fuse/templates/pv.yaml b/src/helm-charts/storage/gcs-fuse/templates/pv.yaml index 4e0d3b4..efafd9f 100644 --- a/src/helm-charts/storage/gcs-fuse/templates/pv.yaml +++ b/src/helm-charts/storage/gcs-fuse/templates/pv.yaml @@ -13,6 +13,7 @@ # limitations under the License. {{- range $gcs := .Values.gcsVolumes }} +{{- if $gcs.bucketName }} apiVersion: v1 kind: PersistentVolume metadata: @@ -45,24 +46,43 @@ spec: gcsfuseMetadataPrefetchOnMount: "true" {{- else if eq $gcs.type "checkpoints" }} mountOptions: - - implicit-dirs - - metadata-cache:negative-ttl-secs:0 - - metadata-cache:ttl-secs:-1 + - implicit-dirs + - metadata-cache:negative-ttl-secs:0 + - metadata-cache:ttl-secs:-1 - metadata-cache:stat-cache-max-size-mb:-1 - metadata-cache:type-cache-max-size-mb:-1 - write:enable-streaming-writes:true # This workaround is for gcsfuse v3.5.0 and below versions. # Earlier GCSFuse versions do not recognize a4-highgpu-8g as a high-performance machine. - # Setting machine-type to a3-highgpu-8g increases the global-max-block, which is + # Setting machine-type to a3-highgpu-8g increases the global-max-block, which is # crucial for streaming writes. Without a higher global-max-block, streaming writes # would fall back to staged writes, resulting in slower write performance. # For gcsfuse v3.5.0 and later, `machine-type:a3-highgpu-8g` can be commented out. 
- machine-type:a3-highgpu-8g - - file-cache:max-size-mb:-1 - - file-cache:cache-file-for-range-read:true - - file-cache:enable-parallel-downloads:true + - file-cache:max-size-mb:-1 + - file-cache:cache-file-for-range-read:true + - file-cache:enable-parallel-downloads:true {{- if $gcs.dirPath }} - - only-dir:{{ $gcs.dirPath }} + - only-dir:{{ $gcs.dirPath }} + {{- end }} + csi: + driver: gcsfuse.csi.storage.gke.io + volumeHandle: {{ $gcs.bucketName }} + volumeAttributes: + skipCSIBucketAccessCheck: "true" + gcsfuseMetadataPrefetchOnMount: "true" +{{- else if eq $gcs.type "serving-model" }} + mountOptions: + - implicit-dirs + - metadata-cache:negative-ttl-secs:0 + - metadata-cache:ttl-secs:-1 + - metadata-cache:stat-cache-max-size-mb:-1 + - metadata-cache:type-cache-max-size-mb:-1 + - read_ahead_kb=1024 + - file-cache:max-size-mb:0 + - file-cache:enable-parallel-downloads:false + {{- if $gcs.dirPath }} + - only-dir:{{ $gcs.dirPath }} {{- end }} csi: driver: gcsfuse.csi.storage.gke.io @@ -72,4 +92,5 @@ spec: gcsfuseMetadataPrefetchOnMount: "true" {{- end }} --- -{{- end }} \ No newline at end of file +{{- end }} +{{- end }} diff --git a/src/helm-charts/storage/gcs-fuse/templates/pvc.yaml b/src/helm-charts/storage/gcs-fuse/templates/pvc.yaml index 1e23afb..b414ca4 100644 --- a/src/helm-charts/storage/gcs-fuse/templates/pvc.yaml +++ b/src/helm-charts/storage/gcs-fuse/templates/pvc.yaml @@ -13,6 +13,7 @@ # limitations under the License. {{- range $gcs := .Values.gcsVolumes }} +{{- if $gcs.bucketName }} apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -27,4 +28,5 @@ spec: storageClassName: gcs-fuse-storage volumeName: {{ $gcs.name }}-pv --- -{{- end }} \ No newline at end of file +{{- end }} +{{- end }} diff --git a/src/helm-charts/storage/gcs-fuse/values.yaml b/src/helm-charts/storage/gcs-fuse/values.yaml index ba6fcd5..7042c5d 100644 --- a/src/helm-charts/storage/gcs-fuse/values.yaml +++ b/src/helm-charts/storage/gcs-fuse/values.yaml @@ -22,4 +22,7 @@ gcsVolumes: bucketName: - name: gcs-checkpoints type: checkpoints - bucketName: \ No newline at end of file + bucketName: + - name: gcs-serving-model + type: serving-model + bucketName: diff --git a/src/launchers/trtllm-launcher-gcs.sh b/src/launchers/trtllm-launcher-gcs.sh new file mode 100644 index 0000000..4558da1 --- /dev/null +++ b/src/launchers/trtllm-launcher-gcs.sh @@ -0,0 +1,211 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash + +# Exit immediately if a command exits with a non-zero status. +set -eux + +echo "TensorRT-LLM benchmark arguments received:" +echo " $@" +echo "" + +# Function to validate model name +validate_model_name() { + if [ -z "$MODEL_NAME" ]; then + echo "Error: MODEL_NAME environment variable is not set." 
+ exit 1 + fi + echo "Using MODEL_NAME: $MODEL_NAME" +} + +# Function to parse arguments +parse_arguments() { + model_name=$MODEL_NAME + isl=128 + osl=128 + num_requests=30000 + + # Parse known arguments and check for unknown option or missing argument + PARSED_OPTIONS=$(getopt -o "" -l model_name:,isl:,osl:,num_requests: -- "$@") + if [ $? -ne 0 ]; then + echo "Error: Failed to parse arguments. Check for invalid options or missing values." + exit 1 + fi + + # set the shell's positional parameters + eval set -- "$PARSED_OPTIONS" + + while true; do + case "$1" in + --model_name) + model_name="$2" + shift 2 + ;; + --isl) + isl="$2" + shift 2 + ;; + --osl) + osl="$2" + shift 2 + ;; + --num_requests) + num_requests="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "Internal error: Argument parsing issue. Unexpected option: $1" + exit 1 + ;; + esac + done + + SERVING_CONFIG=("$@") +} + +# Function to parse serving config +parse_serving_config() { + declare -g -A SERVING_CONFIG_DICT + + for ((index = 0; index < ${#SERVING_CONFIG[@]}; )); do + current_arg="${SERVING_CONFIG[$index]}" + next_arg="${SERVING_CONFIG[$((index + 1))]}" + + # Handle --key=value format + if [[ "$current_arg" =~ ^--[^=]+=.+ ]]; then + key=$(echo "$current_arg" | cut -d'=' -f1 | sed 's/--//') + value=$(echo "$current_arg" | cut -d'=' -f2-) + SERVING_CONFIG_DICT["$key"]="$value" + ((index++)) + # Handle --key value format + elif [[ "$current_arg" =~ ^--[^=]+$ && -n "$next_arg" && ! "$next_arg" =~ ^-- ]]; then + # Check if: + # 1. Current arg starts with -- and has no '=' (e.g., --key) + # 2. There IS a next argument (`-n "$next_arg"`) + # 3. The next argument does NOT start with -- (meaning it's a value, not another option) + key=$(echo "$current_arg" | sed 's/--//') + value="$next_arg" + SERVING_CONFIG_DICT["$key"]="$value" + ((index += 2)) + # Handle --flag (boolean flag without a value) + elif [[ "$current_arg" =~ ^--[^=]+$ ]]; then + # If the key was pre-defined with a default, this will overwrite it to 'true'. + # If not pre-defined, it will create it. 
+ key=$(echo "$current_arg" | sed 's/--//') + SERVING_CONFIG_DICT["$key"]="true" + ((index++)) + else + ((index++)) + fi + done + + tp_size=${SERVING_CONFIG_DICT["tp_size"]:=8} + pp_size=${SERVING_CONFIG_DICT["pp_size"]:=1} + ep_size=${SERVING_CONFIG_DICT["ep_size"]:=1} + backend=${SERVING_CONFIG_DICT["backend"]:="tensorrt"} + kv_cache_free_gpu_mem_fraction=${SERVING_CONFIG_DICT["kv_cache_free_gpu_mem_fraction"]:=0.95} +} + +print_configuration() { + echo "TensorRT-LLM benchmark arguments received:" + echo " $@" + echo "" + echo "--------------------------------" + echo "--- Parsed Arguments Summary ---" + echo "model name: $model_name" + echo "input seq length: $isl" + echo "output seq length: $osl" + echo "number of requests: $num_requests" + echo "tensor parallel size: $tp_size" + echo "pipeline parallel size: $pp_size" + echo "expert parallel size: $ep_size" + echo "backend: $backend" + echo "kv_cache_free_gpu_mem_fraction: $kv_cache_free_gpu_mem_fraction" + echo "--------------------------------" +} + +# Function to run benchmarks +run_benchmark() { + local model_name=$1 + local isl=$2 + local osl=$3 + local num_requests=$4 + local tp_size=$5 + local pp_size=$6 + local ep_size=$7 + local backend=$8 + local kv_cache_free_gpu_mem_fraction=$9 + + echo "Running benchmark for $model_name with ISL=$isl, OSL=$osl, TP=$tp_size, PP=$pp_size, EP=$ep_size, backend=$7" + + dataset_file="/ssd/token-norm-dist_${model_name##*/}_${isl}_${osl}_tp${tp_size}.json" + output_file="/ssd/output_${model_name##*/}_isl${isl}_osl${osl}_tp${tp_size}.txt" + extra_args_file="/tmp/extra_llm_api_args.yaml" + extra_args="" + if [ -f "$extra_args_file" ]; then + extra_args="--extra_llm_api_options $extra_args_file" + fi + + exec > >(tee $output_file) 2>&1 + + echo "Preparing dataset" + python3 $TRTLLM_DIR/benchmarks/cpp/prepare_dataset.py \ + --tokenizer /serving-model \ + --stdout token-norm-dist \ + --num-requests=$num_requests \ + --input-mean=$isl \ + --output-mean=$osl \ + --input-stdev=0 \ + --output-stdev=0 >$dataset_file + + echo "Running throughput benchmark" + trtllm-bench \ + --model $model_name \ + --model_path /serving-model throughput \ + --dataset $dataset_file \ + --tp $tp_size \ + --pp $pp_size \ + --ep $ep_size \ + --backend "pytorch" \ + --kv_cache_free_gpu_mem_fraction $kv_cache_free_gpu_mem_fraction $extra_args + + cp $output_file /gcs-logs/benchmark_logs/trtllm/ + + rm -f $dataset_file +} + +# Main function to run the benchmark +main() { + # parse arguments + validate_model_name + parse_arguments "$@" + parse_serving_config + print_configuration "$@" + + # run benchmark + mkdir -p /gcs-logs/benchmark_logs/trtllm + echo "Running benchmarks" + run_benchmark "$model_name" $isl $osl $num_requests $tp_size $pp_size $ep_size $backend $kv_cache_free_gpu_mem_fraction +} + +# Set environment variables +export LD_LIBRARY_PATH=/usr/local/lib/python3.12/dist-packages/torch/lib:/usr/local/lib/python3.12/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/tensorrt/lib + +# Run the main function +main "$@" From d74e9184e7466233c1bea4f4696c6e02ea438ce0 Mon Sep 17 00:00:00 2001 From: lepan-google Date: Wed, 3 Dec 2025 08:29:27 +0000 Subject: [PATCH 02/10] Fix readme --- .../tensorrt-llm-gcs/README.md | 76 +++++++++---------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md index 
97ff76e..67bb4a4 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md @@ -1,14 +1,13 @@ -# Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) on A4x GKE Node Pool using Google Cloud Storage +# Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4x GKE Node Pool -This document outlines the steps to serve and benchmark various Large Language Models (LLMs) using the [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) framework on a single [A4x GKE Node pool](https://cloud.google.com/kubernetes-engine) -and using Google Cloud Storage to hold the model folder. +This document outlines the steps to serve and benchmark various Large Language Models (LLMs) using the [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) framework on a single [A4x GKE Node pool](https://cloud.google.com/kubernetes-engine), with model stored in Google Cloud Storage. This guide walks you through setting up the necessary cloud infrastructure, configuring your environment, and deploying a high-performance LLM for inference. ## Table of Contents -- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) on A4x GKE Node Pool using Google Cloud Storage](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-on-a4x-gke-node-pool-using-google-cloud-storage) +- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4x GKE Node Pool](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-and-google-cloud-storage-on-a4x-gke-node-pool) - [Table of Contents](#table-of-contents) - [1. Test Environment](#1-test-environment) - [2. High-Level Flow](#2-high-level-flow) @@ -16,7 +15,7 @@ This guide walks you through setting up the necessary cloud infrastructure, conf - [3.1. Clone the Repository](#31-clone-the-repository) - [3.2. Configure Environment Variables](#32-configure-environment-variables) - [3.3. Connect to your GKE Cluster](#33-connect-to-your-gke-cluster) - - [3.4 Upload the model checkpoints](#34-upload-the-model-checkpoints) + - [3.4 Upload the Model Checkpoints](#34-upload-the-model-checkpoints) - [3.5 Create Persistent Volumes and Persistent Volume Claims](#35-create-persistent-volumes-and-persistent-volume-claims) - [3.6 Grant Storage Permission to Kubernetes Service Account](#36-grant-storage-permission-to-kubernetes-service-account) - [4. Run the recipe](#4-run-the-recipe) @@ -34,8 +33,7 @@ This guide walks you through setting up the necessary cloud infrastructure, conf The recipe uses the following setup: * **Orchestration**: [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) -* **Deployment Configuration**: A [Helm chart](https://helm.sh/) is used to configure and deploy a [Kubernetes Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). This deployment encapsulates the inference of the target LLM using the TensorRT-LLM framework. - Another Helm chart is used to create persistent volume and persistent volume claim for model folder in GCS. +* **Deployment Configuration**: A [Helm chart](https://helm.sh/) is used to configure and deploy a [Kubernetes Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). This deployment encapsulates the inference of the target LLM using the TensorRT-LLM framework. 
A separate Helm chart creates the necessary Persistent Volume (PV) and Persistent Volume Claim (PVC), facilitating access to the model stored in Google Cloud Storage (GCS). This recipe has been optimized for and tested with the following configuration: @@ -128,24 +126,15 @@ export RECIPE_ROOT=$REPO_ROOT/inference/a4x/single-host-serving/tensorrt-llm-gcs This is the most critical step. These variables are used in subsequent commands to target the correct resources. ```bash -export PROJECT_ID= -export CLUSTER_REGION= -export CLUSTER_NAME= -export KUEUE_NAME= -export GCS_BUCKET_LOGS= -export GCS_BUCKET_SERVING_MODEL= +export PROJECT_ID= +export CLUSTER_REGION= +export CLUSTER_NAME= +export KUEUE_NAME= +export GCS_BUCKET_LOGS= +export GCS_BUCKET_SERVING_MODEL= export GCS_FOLDER_SERVING_MODEL= export TRTLLM_VERSION=1.2.0rc2 -export PROJECT_ID=supercomputer-testing -export CLUSTER_REGION=us-west8 -export CLUSTER_NAME=imo-glacier-peak -export KUEUE_NAME=a4x -export GCS_BUCKET_LOGS=tess-benchmark-outputs -export GCS_BUCKET_SERVING_MODEL=serving-model-us-west8 -export GCS_FOLDER_SERVING_MODEL=cp-hf -export TRTLLM_VERSION=1.2.0rc2 - # Set the project for gcloud commands gcloud config set project $PROJECT_ID ``` @@ -160,7 +149,7 @@ Replace the following values: | `KUEUE_NAME` | The name of the Kueue local queue. The default queue created by the cluster toolkit is `a4x`. Verify the name in your cluster. | `a4x` | | `GCS_BUCKET_LOGS` | Name of your GCS logs bucket (do not include `gs://`). | `my-benchmark-logs-bucket` | | `GCS_BUCKET_SERVING_MODEL` | Name of your GCS model bucket (do not include `gs://`). | `my-benchmark-model-bucket` | -| `GCS_FOLDER_SERVING_MODEL` | Name of your GCS model folder (do not include `gs://{your-model-bucket}`). | `my-benchmark-model-bucket` | +| `GCS_FOLDER_SERVING_MODEL` | Name of your GCS model folder (do not include `gs://{your-model-bucket}/`). | `my-benchmark-model-folder` | | `TRTLLM_VERSION` | The tag/version for the Docker image. Other verions can be found at https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release | `1.2.0rc2` | @@ -173,16 +162,24 @@ Fetch credentials for `kubectl` to communicate with your cluster. gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION ``` - -### 3.4 Upload the model checkpoints +### 3.4 Upload the Model Checkpoints + In this recipe, we are using [DeepSeek-R1 671B model](https://huggingface.co/deepseek-ai/DeepSeek-R1) on HuggingFace. -To download the model: -- [mount the bucket](https://docs.cloud.google.com/storage/docs/cloud-storage-fuse/mount-bucket) +To download the model, please follow the steps below: +1. [mount the bucket](https://docs.cloud.google.com/storage/docs/cloud-storage-fuse/mount-bucket) to your local system. -- Access into the mount path, create a folder to hold model. -- [Download](https://huggingface.co/docs/hub/en/models-downloading) -the model through `hf command` +2. Access into the mount point, create the model folder. + +Under the mount point: + +```bash +3. [Download](https://huggingface.co/docs/hub/en/models-downloading) +the model using `hf command`: + +```bash +hf download deepseek-ai/DeepSeek-R1 +``` ### 3.5 Create Persistent Volumes and Persistent Volume Claims @@ -195,7 +192,7 @@ Claims (PVC). You must generate PVs and PVCs for serving bucket using the The chart configures the FUSE driver settings following the best practices for optimizing access to buckets for training data and checkpoints. 
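The `gcsVolumes[2]` index in the command below targets the `serving-model` entry defined in the chart's [values.yaml](../../../../src/helm-charts/storage/gcs-fuse/values.yaml). A minimal sketch of that entry (the earlier entries for training data and checkpoints are omitted):

```yaml
gcsVolumes:
  # ...entries for training data and checkpoints omitted...
  - name: gcs-serving-model
    type: serving-model
    bucketName:   # supplied with --set gcsVolumes[2].bucketName=${GCS_BUCKET_SERVING_MODEL}
    # dirPath is supplied with --set gcsVolumes[2].dirPath=${GCS_FOLDER_SERVING_MODEL}
```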
-``` +```bash helm install -f $REPO_ROOT/src/helm-charts/storage/gcs-fuse/values.yaml \ --set gcsVolumes[2].bucketName=${GCS_BUCKET_SERVING_MODEL} \ --set gcsVolumes[2].dirPath=${GCS_FOLDER_SERVING_MODEL} \ @@ -207,11 +204,11 @@ $REPO_ROOT/src/helm-charts/storage/gcs-fuse ### 3.6 Grant Storage Permission to Kubernetes Service Account For a cluster with -[Workload Identity Federation](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) -, you can grant `roles/storage.objectAdmin` access to Kubernetes service -account following -[instruction](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity#kubernetes-resources-iam-policies). - +[Workload Identity Federation](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) enabled +, please following +[instructions](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity#kubernetes-resources-iam-policies) +to grant `roles/storage.objectAdmin` access to Kubernetes service +account. ## 4. Run the recipe @@ -230,7 +227,7 @@ This recipe supports the deployment of the following models: [Back to Top](#table-of-contents) -The recipe runs inference throughput benchmark for [DeepSeek-R1 671B NVFP4 model](https://huggingface.co/nvidia/DeepSeek-R1-NVFP4-v2) which is Nvidia's pre-quantized FP4 checkpoint of the original [DeepSeek-R1 671B model](https://huggingface.co/deepseek-ai/DeepSeek-R1). +The recipe runs inference throughputs benchmark for [DeepSeek-R1 671B NVFP4 model](https://huggingface.co/nvidia/DeepSeek-R1-NVFP4-v2) which is Nvidia's pre-quantized FP4 checkpoint of the original [DeepSeek-R1 671B model](https://huggingface.co/deepseek-ai/DeepSeek-R1). The recipe uses [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-benchmarking.md), a command-line tool from NVIDIA to benchmark the performance of TensorRT-LLM engine. For more information about `trtllm-bench`, see the [TensorRT-LLM documentation](https://github.com/NVIDIA/TensorRT-LLM). @@ -240,16 +237,15 @@ The recipe uses [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/mai 1. 
Install the helm chart to prepare and benchmark the model using [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-benchmarking.md) tool: ```bash - --set queue=${KUEUE_NAME} \ - cd $RECIPE_ROOT helm install -f values.yaml \ --set-file workload_launcher=$REPO_ROOT/src/launchers/trtllm-launcher-gcs.sh \ --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/trtllm-configs/deepseek-r1-nvfp4.yaml \ + --set queue=${KUEUE_NAME} \ --set "volumes.gcsMounts[0].bucketName=${GCS_BUCKET_LOGS}" \ --set workload.model.name=nvidia/DeepSeek-R1-NVFP4-v2 \ --set workload.image=nvcr.io/nvidia/tensorrt-llm/release:${TRTLLM_VERSION} \ - $USER-serving-deepseek-r1-model-39 \ + $USER-serving-deepseek-r1-model \ $REPO_ROOT/src/helm-charts/a4x/inference-templates-gcs/deployment ``` From a7c2bcf093179a3594523f4705deb1f67576b7e0 Mon Sep 17 00:00:00 2001 From: lepan-google Date: Wed, 3 Dec 2025 09:06:13 +0000 Subject: [PATCH 03/10] Fix README --- .../tensorrt-llm-gcs/README.md | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md index 67bb4a4..7727e35 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md @@ -1,4 +1,4 @@ -# Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4x GKE Node Pool +# Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4X GKE Node Pool This document outlines the steps to serve and benchmark various Large Language Models (LLMs) using the [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) framework on a single [A4x GKE Node pool](https://cloud.google.com/kubernetes-engine), with model stored in Google Cloud Storage. @@ -7,7 +7,7 @@ This guide walks you through setting up the necessary cloud infrastructure, conf ## Table of Contents -- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4x GKE Node Pool](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-and-google-cloud-storage-on-a4x-gke-node-pool) +- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4X GKE Node Pool](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-and-google-cloud-storage-on-a4x-gke-node-pool) - [Table of Contents](#table-of-contents) - [1. Test Environment](#1-test-environment) - [2. High-Level Flow](#2-high-level-flow) @@ -33,7 +33,8 @@ This guide walks you through setting up the necessary cloud infrastructure, conf The recipe uses the following setup: * **Orchestration**: [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) -* **Deployment Configuration**: A [Helm chart](https://helm.sh/) is used to configure and deploy a [Kubernetes Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). This deployment encapsulates the inference of the target LLM using the TensorRT-LLM framework. A separate Helm chart creates the necessary Persistent Volume (PV) and Persistent Volume Claim (PVC), facilitating access to the model stored in Google Cloud Storage (GCS). +* **Deployment Configuration**: A [Helm chart](https://helm.sh/) is used to configure and deploy a [Kubernetes Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). 
This deployment encapsulates the inference of the target LLM using the TensorRT-LLM framework. A separate Helm chart creates the necessary Persistent Volume (PV) and Persistent Volume Claim (PVC), facilitating access to the model stored in + [Google Cloud Storage (GCS)](https://cloud.google.com/storage?hl=en). This recipe has been optimized for and tested with the following configuration: @@ -89,7 +90,7 @@ flowchart TD C --> D D --> E F --> C - H -- Downloads at runtime --> E + H -- Load at runtime --> E E -- Write logs --> J @@ -149,7 +150,7 @@ Replace the following values: | `KUEUE_NAME` | The name of the Kueue local queue. The default queue created by the cluster toolkit is `a4x`. Verify the name in your cluster. | `a4x` | | `GCS_BUCKET_LOGS` | Name of your GCS logs bucket (do not include `gs://`). | `my-benchmark-logs-bucket` | | `GCS_BUCKET_SERVING_MODEL` | Name of your GCS model bucket (do not include `gs://`). | `my-benchmark-model-bucket` | -| `GCS_FOLDER_SERVING_MODEL` | Name of your GCS model folder (do not include `gs://{your-model-bucket}/`). | `my-benchmark-model-folder` | +| `GCS_FOLDER_SERVING_MODEL` | Name of your GCS model folder (do not include `gs://{your-benchmark-model-bucket}/`). | `my-benchmark-model-folder` | | `TRTLLM_VERSION` | The tag/version for the Docker image. Other verions can be found at https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release | `1.2.0rc2` | @@ -165,32 +166,29 @@ gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION ### 3.4 Upload the Model Checkpoints -In this recipe, we are using [DeepSeek-R1 671B model](https://huggingface.co/deepseek-ai/DeepSeek-R1) on HuggingFace. -To download the model, please follow the steps below: -1. [mount the bucket](https://docs.cloud.google.com/storage/docs/cloud-storage-fuse/mount-bucket) -to your local system. -2. Access into the mount point, create the model folder. - -Under the mount point: +To download the model from HuggingFace, please follow the steps below: -```bash -3. [Download](https://huggingface.co/docs/hub/en/models-downloading) -the model using `hf command`: +1. [Mount the bucket](https://docs.cloud.google.com/storage/docs/cloud-storage-fuse/mount-bucket) +to your local system. +1. Access into the mount point and create the model folder. +2. Under the mount point, + [download](https://huggingface.co/docs/hub/en/models-downloading) the model + using the `hf` command: -```bash -hf download deepseek-ai/DeepSeek-R1 -``` + ```bash + hf download {MODEL_NAME} + ``` ### 3.5 Create Persistent Volumes and Persistent Volume Claims -The inference deployment accesses GCS buckets for model through +The inference deployment accesses GCS buckets for serving model through [the Cloud Storage FUSE CSI driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver) configured using Kubernetes Persistent Volumes (PV) and Persistent Volume -Claims (PVC). You must generate PVs and PVCs for serving bucket using the +Claims (PVC). You must generate PVs and PVCs for serving modelbucket using the [gcs-fuse helper Helm chart](../../../../src/helm-charts/storage/gcs-fuse). The chart configures the FUSE driver settings following the best practices -for optimizing access to buckets for training data and checkpoints. +for optimizing access to buckets for serving model. 
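+
+Before creating the Persistent Volume and Persistent Volume Claim, it can help
+to confirm that the checkpoint files are actually visible in the model bucket.
+A minimal check, assuming the `GCS_BUCKET_SERVING_MODEL` and
+`GCS_FOLDER_SERVING_MODEL` variables defined earlier, is:
+
+```bash
+# List a few objects under the model folder; you should see config.json and
+# the *.safetensors shards uploaded in the previous step.
+gcloud storage ls "gs://${GCS_BUCKET_SERVING_MODEL}/${GCS_FOLDER_SERVING_MODEL}/" | head
+```
+
+Then create the Persistent Volume and Persistent Volume Claim with the helper
+chart: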
```bash helm install -f $REPO_ROOT/src/helm-charts/storage/gcs-fuse/values.yaml \ @@ -249,7 +247,7 @@ The recipe uses [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/mai $REPO_ROOT/src/helm-charts/a4x/inference-templates-gcs/deployment ``` - This creates a Helm release and a Deployment named `$USER-serving-deepseek-r1-model`, and a Service named `$USER-serving-deepseek-r1-model-svc`. + This creates a Helm release and a Deployment named `$USER-serving-deepseek-r1-model`, and a Service named `$USER-serving-deepseek-r1-model-svc`. 2. **Check the deployment status.** @@ -373,6 +371,8 @@ P99: 128.0000 128.0000 256.0000 ## 6. Cleanup +[Back to Top](#table-of-contents) + To avoid incurring further charges, clean up the resources you created. 1. **Uninstall the Helm Release:** From 2714d00a256df695c827f380b7978364a2284351 Mon Sep 17 00:00:00 2001 From: lepan-google Date: Fri, 5 Dec 2025 20:50:17 +0000 Subject: [PATCH 04/10] Resolve comments --- .../tensorrt-llm-gcs/README.md | 38 +++++++++---------- .../tensorrt-llm-gcs/values.yaml | 19 ---------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md index 7727e35..dc00c11 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md @@ -1,29 +1,27 @@ # Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4X GKE Node Pool -This document outlines the steps to serve and benchmark various Large Language Models (LLMs) using the [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) framework on a single [A4x GKE Node pool](https://cloud.google.com/kubernetes-engine), with model stored in Google Cloud Storage. +This document outlines the steps to serve and benchmark various Large Language Models (LLMs) using the [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) framework on a single [A4X GKE Node pool](https://cloud.google.com/kubernetes-engine), with model stored in Google Cloud Storage. This guide walks you through setting up the necessary cloud infrastructure, configuring your environment, and deploying a high-performance LLM for inference. ## Table of Contents -- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4X GKE Node Pool](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-and-google-cloud-storage-on-a4x-gke-node-pool) - - [Table of Contents](#table-of-contents) - - [1. Test Environment](#1-test-environment) - - [2. High-Level Flow](#2-high-level-flow) - - [3. Environment Setup (One-Time)](#3-environment-setup-one-time) - - [3.1. Clone the Repository](#31-clone-the-repository) - - [3.2. Configure Environment Variables](#32-configure-environment-variables) - - [3.3. Connect to your GKE Cluster](#33-connect-to-your-gke-cluster) - - [3.4 Upload the Model Checkpoints](#34-upload-the-model-checkpoints) - - [3.5 Create Persistent Volumes and Persistent Volume Claims](#35-create-persistent-volumes-and-persistent-volume-claims) - - [3.6 Grant Storage Permission to Kubernetes Service Account](#36-grant-storage-permission-to-kubernetes-service-account) - - [4. Run the recipe](#4-run-the-recipe) - - [4.1. Inference benchmark for DeepSeek-R1 671B Model](#41-inference-benchmark-for-deepseek-r1-671b-model) - - [5. Monitoring and Troubleshooting](#5-monitoring-and-troubleshooting) - - [5.1. 
Check Deployment Status](#51-check-deployment-status) - - [5.2. View Logs](#52-view-logs) - - [6. Cleanup](#6-cleanup) +* [1. Test Environment](#test-environment) +* [2. High-Level Architecture](#architecture) +* [3. Environment Setup (One-Time)](#environment-setup) + * [3.1. Clone the Repository](#clone-repo) + * [3.2. Configure Environment Variables](#configure-vars) + * [3.3. Connect to your GKE Cluster](#connect-cluster) + * [3.4 Upload the model checkpoints](#upload-the-model-checkpoints) + * [3.5 Create Persistent Volumes and Persistent Volume Claims](#create-persistent-volumes-and-persistent-volume-claims) + * [3.6 Grant Storage Permissions to Kubernetes Service Account](#grant-storage-permission-to-kubernetes-service-account) +* [4. Run the Recipe](#run-the-recipe) + * [4.1. Inference benchmark for DeepSeek-R1 671B](#serving-deepseek-r1-671b) +* [5. Monitoring and Troubleshooting](#monitoring) + * [5.1. Check Deployment Status](#check-status) + * [5.2. View Logs](#view-logs) +* [6. Cleanup](#cleanup) ## 1. Test Environment @@ -72,7 +70,7 @@ flowchart TD B("Kubernetes API") A["helm install"] end - subgraph gke["GKE Cluster (A4x)"] + subgraph gke["GKE Cluster (A4X)"] C["Deployment"] D["Pod"] E["TensorRT-LLM container"] @@ -204,7 +202,7 @@ $REPO_ROOT/src/helm-charts/storage/gcs-fuse For a cluster with [Workload Identity Federation](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) enabled , please following -[instructions](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity#kubernetes-resources-iam-policies) +[these instructions](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity#kubernetes-resources-iam-policies) to grant `roles/storage.objectAdmin` access to Kubernetes service account. 
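+
+For reference, the IAM binding described above typically takes the following
+form. This is only an illustrative sketch: the project number, namespace, and
+Kubernetes service account name are placeholders that are not defined by this
+recipe.
+
+```bash
+# Grant the Kubernetes service account object access to the model bucket
+# through Workload Identity Federation for GKE.
+gcloud storage buckets add-iam-policy-binding "gs://${GCS_BUCKET_SERVING_MODEL}" \
+  --member="principal://iam.googleapis.com/projects/PROJECT_NUMBER/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/NAMESPACE/sa/KSA_NAME" \
+  --role="roles/storage.objectAdmin"
+```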
diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/values.yaml b/inference/a4x/single-host-serving/tensorrt-llm-gcs/values.yaml index 47d0d59..c679e06 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-gcs/values.yaml +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/values.yaml @@ -57,22 +57,3 @@ network: - name: NCCL_DEBUG value: "VERSION" -targetNodes: -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-0phl -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-12dg -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-4ncf -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-6t1h -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-7knw -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-8z5k -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-jt1t -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-km6k -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-lr4w -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-lzj8 -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-nghr -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-q1rz -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-rvlb -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-sjcg -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-v549 -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-vd4h -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-x1qx -- gke-imo-glacier-peak-a4x-highgpu-4g-a-76a6f770-z47k From 8e1bd1c31eac9674e4c2afd039ee494c2033e396 Mon Sep 17 00:00:00 2001 From: lepan-google Date: Fri, 5 Dec 2025 20:54:16 +0000 Subject: [PATCH 05/10] Format the content table --- .../tensorrt-llm-gcs/README.md | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md index dc00c11..3fa80e1 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md @@ -7,21 +7,23 @@ This guide walks you through setting up the necessary cloud infrastructure, conf ## Table of Contents -* [1. Test Environment](#test-environment) -* [2. High-Level Architecture](#architecture) -* [3. Environment Setup (One-Time)](#environment-setup) - * [3.1. Clone the Repository](#clone-repo) - * [3.2. Configure Environment Variables](#configure-vars) - * [3.3. Connect to your GKE Cluster](#connect-cluster) - * [3.4 Upload the model checkpoints](#upload-the-model-checkpoints) - * [3.5 Create Persistent Volumes and Persistent Volume Claims](#create-persistent-volumes-and-persistent-volume-claims) - * [3.6 Grant Storage Permissions to Kubernetes Service Account](#grant-storage-permission-to-kubernetes-service-account) -* [4. Run the Recipe](#run-the-recipe) - * [4.1. Inference benchmark for DeepSeek-R1 671B](#serving-deepseek-r1-671b) -* [5. Monitoring and Troubleshooting](#monitoring) - * [5.1. Check Deployment Status](#check-status) - * [5.2. View Logs](#view-logs) -* [6. Cleanup](#cleanup) +- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4X GKE Node Pool](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-and-google-cloud-storage-on-a4x-gke-node-pool) + - [Table of Contents](#table-of-contents) + - [1. Test Environment](#1-test-environment) + - [2. High-Level Flow](#2-high-level-flow) + - [3. Environment Setup (One-Time)](#3-environment-setup-one-time) + - [3.1. Clone the Repository](#31-clone-the-repository) + - [3.2. 
Configure Environment Variables](#32-configure-environment-variables) + - [3.3. Connect to your GKE Cluster](#33-connect-to-your-gke-cluster) + - [3.4 Upload the Model Checkpoints](#34-upload-the-model-checkpoints) + - [3.5 Create Persistent Volumes and Persistent Volume Claims](#35-create-persistent-volumes-and-persistent-volume-claims) + - [3.6 Grant Storage Permission to Kubernetes Service Account](#36-grant-storage-permission-to-kubernetes-service-account) + - [4. Run the recipe](#4-run-the-recipe) + - [4.1. Inference benchmark for DeepSeek-R1 671B Model](#41-inference-benchmark-for-deepseek-r1-671b-model) + - [5. Monitoring and Troubleshooting](#5-monitoring-and-troubleshooting) + - [5.1. Check Deployment Status](#51-check-deployment-status) + - [5.2. View Logs](#52-view-logs) + - [6. Cleanup](#6-cleanup) ## 1. Test Environment From 96e289e2217d902f327b0b58279412683da22c02 Mon Sep 17 00:00:00 2001 From: lepan-google Date: Fri, 5 Dec 2025 20:56:27 +0000 Subject: [PATCH 06/10] Format content tables --- .../tensorrt-llm-gcs/README.md | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md index 3fa80e1..110891f 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md @@ -7,23 +7,21 @@ This guide walks you through setting up the necessary cloud infrastructure, conf ## Table of Contents -- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4X GKE Node Pool](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-and-google-cloud-storage-on-a4x-gke-node-pool) - - [Table of Contents](#table-of-contents) - - [1. Test Environment](#1-test-environment) - - [2. High-Level Flow](#2-high-level-flow) - - [3. Environment Setup (One-Time)](#3-environment-setup-one-time) - - [3.1. Clone the Repository](#31-clone-the-repository) - - [3.2. Configure Environment Variables](#32-configure-environment-variables) - - [3.3. Connect to your GKE Cluster](#33-connect-to-your-gke-cluster) - - [3.4 Upload the Model Checkpoints](#34-upload-the-model-checkpoints) - - [3.5 Create Persistent Volumes and Persistent Volume Claims](#35-create-persistent-volumes-and-persistent-volume-claims) - - [3.6 Grant Storage Permission to Kubernetes Service Account](#36-grant-storage-permission-to-kubernetes-service-account) - - [4. Run the recipe](#4-run-the-recipe) - - [4.1. Inference benchmark for DeepSeek-R1 671B Model](#41-inference-benchmark-for-deepseek-r1-671b-model) - - [5. Monitoring and Troubleshooting](#5-monitoring-and-troubleshooting) - - [5.1. Check Deployment Status](#51-check-deployment-status) - - [5.2. View Logs](#52-view-logs) - - [6. Cleanup](#6-cleanup) +* [1. Test Environment](#test-environment) +* [2. High-Level Architecture](#architecture) +* [3. Environment Setup (One-Time)](#environment-setup) + * [3.1. Clone the Repository](#clone-repo) + * [3.2. Configure Environment Variables](#configure-vars) + * [3.3. Connect to your GKE Cluster](#connect-cluster) + * [3.4. Upload the Model Checkpoints](#upload-the-model-checkpoints) + * [3.5. Create Persistent Volumes and Persistent Volume Claims](#create-persistent-volumes-and-persistent-volume-claims) + * [3.6. Grant Storage Permissions to Kubernetes Service Account](#grant-storage-permission-to-kubernetes-service-account) +* [4. Run the Recipe](#run-the-recipe) + * [4.1. 
Inference benchmark for DeepSeek-R1 671B](#serving-deepseek-r1-671b) +* [5. Monitoring and Troubleshooting](#monitoring) + * [5.1. Check Deployment Status](#check-status) + * [5.2. View Logs](#view-logs) +* [6. Cleanup](#cleanup) ## 1. Test Environment From d0a7d9bf1b2c5491bd91df72b2112cd470815d69 Mon Sep 17 00:00:00 2001 From: lepan-google Date: Fri, 5 Dec 2025 23:30:21 +0000 Subject: [PATCH 07/10] Correct grammar issue in README --- .../tensorrt-llm-gcs/README.md | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md index 110891f..04e6aa8 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md @@ -7,21 +7,23 @@ This guide walks you through setting up the necessary cloud infrastructure, conf ## Table of Contents -* [1. Test Environment](#test-environment) -* [2. High-Level Architecture](#architecture) -* [3. Environment Setup (One-Time)](#environment-setup) - * [3.1. Clone the Repository](#clone-repo) - * [3.2. Configure Environment Variables](#configure-vars) - * [3.3. Connect to your GKE Cluster](#connect-cluster) - * [3.4. Upload the Model Checkpoints](#upload-the-model-checkpoints) - * [3.5. Create Persistent Volumes and Persistent Volume Claims](#create-persistent-volumes-and-persistent-volume-claims) - * [3.6. Grant Storage Permissions to Kubernetes Service Account](#grant-storage-permission-to-kubernetes-service-account) -* [4. Run the Recipe](#run-the-recipe) - * [4.1. Inference benchmark for DeepSeek-R1 671B](#serving-deepseek-r1-671b) -* [5. Monitoring and Troubleshooting](#monitoring) - * [5.1. Check Deployment Status](#check-status) - * [5.2. View Logs](#view-logs) -* [6. Cleanup](#cleanup) +- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4X GKE Node Pool](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-and-google-cloud-storage-on-a4x-gke-node-pool) + - [Table of Contents](#table-of-contents) + - [1. Test Environment](#1-test-environment) + - [2. High-Level Flow](#2-high-level-flow) + - [3. Environment Setup (One-Time)](#3-environment-setup-one-time) + - [3.1. Clone the Repository](#31-clone-the-repository) + - [3.2. Configure Environment Variables](#32-configure-environment-variables) + - [3.3. Connect to your GKE Cluster](#33-connect-to-your-gke-cluster) + - [3.4 Upload the Model Checkpoints](#34-upload-the-model-checkpoints) + - [3.5 Create Persistent Volumes and Persistent Volume Claims](#35-create-persistent-volumes-and-persistent-volume-claims) + - [3.6 Grant Storage Permission to Kubernetes Service Account](#36-grant-storage-permission-to-kubernetes-service-account) + - [4. Run the recipe](#4-run-the-recipe) + - [4.1. Inference benchmark for DeepSeek-R1 671B Model](#41-inference-benchmark-for-deepseek-r1-671b-model) + - [5. Monitoring and Troubleshooting](#5-monitoring-and-troubleshooting) + - [5.1. Check Deployment Status](#51-check-deployment-status) + - [5.2. View Logs](#52-view-logs) + - [6. Cleanup](#6-cleanup) ## 1. 
Test Environment @@ -201,7 +203,7 @@ $REPO_ROOT/src/helm-charts/storage/gcs-fuse For a cluster with [Workload Identity Federation](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) enabled -, please following +, please follow [these instructions](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity#kubernetes-resources-iam-policies) to grant `roles/storage.objectAdmin` access to Kubernetes service account. From 504075259bc68bd5999f1ea9df20e26ce03a0c81 Mon Sep 17 00:00:00 2001 From: lepan-google Date: Fri, 5 Dec 2025 23:32:05 +0000 Subject: [PATCH 08/10] Correct format --- .../tensorrt-llm-gcs/README.md | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md index 04e6aa8..d3bc780 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md +++ b/inference/a4x/single-host-serving/tensorrt-llm-gcs/README.md @@ -7,23 +7,21 @@ This guide walks you through setting up the necessary cloud infrastructure, conf ## Table of Contents -- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Storage on A4X GKE Node Pool](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-and-google-cloud-storage-on-a4x-gke-node-pool) - - [Table of Contents](#table-of-contents) - - [1. Test Environment](#1-test-environment) - - [2. High-Level Flow](#2-high-level-flow) - - [3. Environment Setup (One-Time)](#3-environment-setup-one-time) - - [3.1. Clone the Repository](#31-clone-the-repository) - - [3.2. Configure Environment Variables](#32-configure-environment-variables) - - [3.3. Connect to your GKE Cluster](#33-connect-to-your-gke-cluster) - - [3.4 Upload the Model Checkpoints](#34-upload-the-model-checkpoints) - - [3.5 Create Persistent Volumes and Persistent Volume Claims](#35-create-persistent-volumes-and-persistent-volume-claims) - - [3.6 Grant Storage Permission to Kubernetes Service Account](#36-grant-storage-permission-to-kubernetes-service-account) - - [4. Run the recipe](#4-run-the-recipe) - - [4.1. Inference benchmark for DeepSeek-R1 671B Model](#41-inference-benchmark-for-deepseek-r1-671b-model) - - [5. Monitoring and Troubleshooting](#5-monitoring-and-troubleshooting) - - [5.1. Check Deployment Status](#51-check-deployment-status) - - [5.2. View Logs](#52-view-logs) - - [6. Cleanup](#6-cleanup) +* [1. Test Environment](#test-environment) +* [2. High-Level Architecture](#architecture) +* [3. Environment Setup (One-Time)](#environment-setup) + * [3.1. Clone the Repository](#clone-repo) + * [3.2. Configure Environment Variables](#configure-vars) + * [3.3. Connect to your GKE Cluster](#connect-cluster) + * [3.4. Upload the Model Checkpoints](#upload-the-model-checkpoints) + * [3.5. Create Persistent Volumes and Persistent Volume Claims](#create-persistent-volumes-and-persistent-volume-claims) + * [3.6. Grant Storage Permissions to Kubernetes Service Account](#grant-storage-permission-to-kubernetes-service-account) +* [4. Run the Recipe](#run-the-recipe) + * [4.1. Inference benchmark for DeepSeek-R1 671B](#serving-deepseek-r1-671b) +* [5. Monitoring and Troubleshooting](#monitoring) + * [5.1. Check Deployment Status](#check-status) + * [5.2. View Logs](#view-logs) +* [6. Cleanup](#cleanup) ## 1. 
Test Environment From c68a34d2e95731176fb0f8b03fae0613b6a8fc0e Mon Sep 17 00:00:00 2001 From: lepan-google Date: Wed, 10 Dec 2025 08:44:57 +0000 Subject: [PATCH 09/10] [A4X TensorRT Inference Benchmark] A4X DeepSeek R1 NVFP4 on TensorRT with Lustre Storage This change added deployment configs and instructions for A4X DeepSeek R1 NVFP4 on TensorRT with Lustre storage. This recipe is modified based on existing [CMCS recipe with HuggingFace](AI-Hypercomputer#50) and [GCS recipe](https://github.com/AI-Hypercomputer/gpu-recipes/pull/55). TESTED=local tests --- .../tensorrt-llm-lustre/README.md | 453 ++++++++++++++++++ .../tensorrt-llm-lustre/values.yaml | 63 +++ .../deployment/Chart.yaml | 20 + .../templates/serving-config-configmap.yaml | 25 + .../templates/serving-launcher-configmap.yaml | 27 ++ .../templates/serving-launcher.yaml | 268 +++++++++++ .../deployment/templates/serving-svc.yaml | 26 + src/helm-charts/storage/lustre/Chart.yaml | 20 + .../storage/lustre/templates/pv.yaml | 39 ++ .../storage/lustre/templates/pvc.yaml | 29 ++ src/helm-charts/storage/lustre/values.yaml | 26 + src/launchers/trtllm-launcher-lustre.sh | 211 ++++++++ 12 files changed, 1207 insertions(+) create mode 100644 inference/a4x/single-host-serving/tensorrt-llm-lustre/README.md create mode 100644 inference/a4x/single-host-serving/tensorrt-llm-lustre/values.yaml create mode 100644 src/helm-charts/a4x/inference-templates-lustre/deployment/Chart.yaml create mode 100644 src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-config-configmap.yaml create mode 100644 src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-launcher-configmap.yaml create mode 100644 src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-launcher.yaml create mode 100644 src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-svc.yaml create mode 100644 src/helm-charts/storage/lustre/Chart.yaml create mode 100644 src/helm-charts/storage/lustre/templates/pv.yaml create mode 100644 src/helm-charts/storage/lustre/templates/pvc.yaml create mode 100644 src/helm-charts/storage/lustre/values.yaml create mode 100644 src/launchers/trtllm-launcher-lustre.sh diff --git a/inference/a4x/single-host-serving/tensorrt-llm-lustre/README.md b/inference/a4x/single-host-serving/tensorrt-llm-lustre/README.md new file mode 100644 index 0000000..4c63b68 --- /dev/null +++ b/inference/a4x/single-host-serving/tensorrt-llm-lustre/README.md @@ -0,0 +1,453 @@ +# Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Managed Lustre on A4X GKE Node Pool + +This document outlines the steps to serve and benchmark various Large Language Models (LLMs) using the [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) framework on a single [A4X GKE Node pool](https://cloud.google.com/kubernetes-engine), with model stored in [Google Cloud Managed Lustre](https://docs.cloud.google.com/managed-lustre/docs/overview). + +This guide walks you through setting up the necessary cloud infrastructure, configuring your environment, and deploying a high-performance LLM for inference. + + +## Table of Contents + +- [Single Host Model Serving with NVIDIA TensorRT-LLM (TRT-LLM) and Google Cloud Managed Lustre on A4X GKE Node Pool](#single-host-model-serving-with-nvidia-tensorrt-llm-trt-llm-and-google-cloud-managed-lustre-on-a4x-gke-node-pool) + - [Table of Contents](#table-of-contents) + - [1. Test Environment](#1-test-environment) + - [2. High-Level Flow](#2-high-level-flow) + - [3. 
Environment Setup (One-Time)](#3-environment-setup-one-time) + - [3.1. Clone the Repository](#31-clone-the-repository) + - [3.2. Configure Environment Variables](#32-configure-environment-variables) + - [3.3. Connect to your GKE Cluster](#33-connect-to-your-gke-cluster) + - [3.4 Upload the Model Checkpoints](#34-upload-the-model-checkpoints) + - [3.5 Create Persistent Volumes and Persistent Volume Claims](#35-create-persistent-volumes-and-persistent-volume-claims) + - [3.6 Enable Managed Lustre CSI Driver on an Existing GKE Cluster](#36-enable-managed-lustre-csi-driver-on-an-existing-gke-cluster) + - [4. Run the recipe](#4-run-the-recipe) + - [4.1. Inference benchmark for DeepSeek-R1 671B Model](#41-inference-benchmark-for-deepseek-r1-671b-model) + - [5. Monitoring and Troubleshooting](#5-monitoring-and-troubleshooting) + - [5.1. Check Deployment Status](#51-check-deployment-status) + - [5.2. View Logs](#52-view-logs) + - [6. Cleanup](#6-cleanup) + + +## 1. Test Environment + +[Back to Top](#table-of-contents) + +The recipe uses the following setup: + +* **Orchestration**: [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +* **Deployment Configuration**: A [Helm chart](https://helm.sh/) is used to configure and deploy a [Kubernetes Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). This deployment encapsulates the inference of the target LLM using the TensorRT-LLM framework. A separate Helm chart creates the necessary Persistent Volume (PV) and Persistent Volume Claim (PVC), facilitating access to the model stored in + [Google Cloud Managed Lustre](https://docs.cloud.google.com/managed-lustre/docs/overview). + +This recipe has been optimized for and tested with the following configuration: + +* **GKE Cluster**: + * A [regional standard cluster](https://cloud.google.com/kubernetes-engine/docs/concepts/configuration-overview) version: `1.33.4-gke.1036000` or later. + * A GPU node pool with 1 [a4x-highgpu-4g](https://cloud.google.com/compute/docs/gpus) machine. + * [Workload Identity Federation for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) enabled. + * [Cloud Storage FUSE CSI driver for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/cloud-storage-fuse-csi-driver) enabled. + * [DCGM metrics](https://cloud.google.com/kubernetes-engine/docs/how-to/dcgm-metrics) enabled. + * [Kueue](https://kueue.sigs.k8s.io/docs/reference/kueue.v1beta1/) and [JobSet](https://jobset.sigs.k8s.io/docs/overview/) APIs installed. + * Kueue configured to support [Topology Aware Scheduling](https://kueue.sigs.k8s.io/docs/concepts/topology_aware_scheduling/). +* A regional Google Cloud Storage (GCS) bucket to store logs generated by the recipe runs. +* A Google Cloud Managed Lustre instance to store model folder. The Lustre instance should be in the same PVC network as your GKE cluster. + +> [!IMPORTANT] +> - To prepare the required environment, see the [GKE environment setup guide](../../../../docs/configuring-environment-gke-a4x.md). +> Provisioning a new GKE cluster is a long-running operation and can take **20-30 minutes**. +> - GCS buckets and Lustre instances must be in the same region as the GKE cluster. + + +## 2. 
High-Level Flow + +[Back to Top](#table-of-contents) + +Here is a simplified diagram of the flow that we follow in this recipe: + +```mermaid +--- +config: + layout: dagre +--- +flowchart TD + subgraph workstation["Client Workstation"] + T["Cluster Toolkit"] + B("Kubernetes API") + A["helm install"] + end + subgraph gke["GKE Cluster (A4X)"] + C["Deployment"] + D["Pod"] + E["TensorRT-LLM container"] + F["Service"] + end + subgraph storage["Managed Lustre"] + G["Model Lustre Instance"] + end + subgraph storage["Cloud Storage"] + H["Logs Bucket"] + end + + %% Logical/actual flow + T -- Create Cluster --> gke + A --> B + B --> C & F + C --> D + D --> E + F --> C + G -- Load at runtime --> E + E -- Write logs --> H + + + %% Layout control + gke +``` + +* **helm:** A package manager for Kubernetes to define, install, and upgrade applications. It's used here to configure and deploy the Kubernetes Deployment. +* **Deployment:** Manages the lifecycle of your model server pod, ensuring it stays running. +* **Service:** Provides a stable network endpoint (a DNS name and IP address) to access your model server. +* **Pod:** The smallest deployable unit in Kubernetes. The Triton server container with TensorRT-LLM runs inside this pod on a GPU-enabled node. +* **Cloud Storage:** Cloud Storage buckets to store benchmark logs and other artifacts. +* **Managed Lustre:** Managed Lustre instances to store model folder. + + +## 3. Environment Setup (One-Time) + +[Back to Top](#table-of-contents) + +First, you'll configure your local environment. These steps are required once before you can deploy any models. + + +### 3.1. Clone the Repository + +```bash +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=$(pwd) +export RECIPE_ROOT=$REPO_ROOT/inference/a4x/single-host-serving/tensorrt-llm-lustre +``` + + +### 3.2. Configure Environment Variables + +This is the most critical step. These variables are used in subsequent commands to target the correct resources. + +```bash +export PROJECT_ID= +export CLUSTER_REGION= +export CLUSTER_NAME= +export KUEUE_NAME= +export GCS_BUCKET_LOGS= +export LUSTRE_INSTANCE_NAME_SERVING_MODEL= +export LUSTER_FOLDER_SERVING_MODEL= +export LUSTRE_CAPACITY_SERVING_MODEL= +export LUSTRE_PROJECT_SERVING_MODEL= +export LUSTRE_LOCATION_SERVING_MODEL= +export LUSTRE_IP_ADDRESS_SERVING_MODEL= +export LUSTRE_FILE_SYSTEM_SERVING_MODEL= +export TRTLLM_VERSION=1.2.0rc2 + +export PROJECT_ID=supercomputer-testing +export CLUSTER_REGION=us-central1 +export CLUSTER_NAME=a4x-baker +export GCS_BUCKET_LOGS=tess-benchmark-outputs +export LUSTRE_INSTANCE_NAME_SERVING_MODEL=a4x-baker +export LUSTER_FOLDER_SERVING_MODEL=DeepSeek-R1-NVFP4-v2 +export LUSTRE_CAPACITY_SERVING_MODEL=126000Gi +export LUSTRE_PROJECT_SERVING_MODEL=supercomputer-testing +export LUSTRE_LOCATION_SERVING_MODEL=us-central1-b +export LUSTRE_IP_ADDRESS_SERVING_MODEL=172.21.47.3 +export LUSTRE_FILE_SYSTEM_SERVING_MODEL=lustrefs +export TRTLLM_VERSION=1.2.0rc2 + +# Set the project for gcloud commands +gcloud config set project $PROJECT_ID +``` + +Replace the following values: + +| Variable | Description | Example | +| --------------------- | ------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | +| `PROJECT_ID` | Your Google Cloud Project ID. | `gcp-project-12345` | +| `CLUSTER_REGION` | The GCP region where your GKE cluster is located. 
| `us-central1` | +| `CLUSTER_NAME` | The name of your GKE cluster. | `a4x-cluster` | +| `KUEUE_NAME` | The name of the Kueue local queue. The default queue created by the cluster toolkit is `a4x`. Verify the name in your cluster. | `a4x` | +| `GCS_BUCKET_LOGS` | Name of your GCS logs bucket (do not include `gs://`). | `my-benchmark-logs-bucket` | +| `LUSTRE_INSTANCE_NAME_SERVING_MODEL` | The name of your Lustre instance. | `my-benchmark-model-lustre` | +| `LUSTER_FOLDER_SERVING_MODEL` | The path to the GCS model folder on the Lustre. | `my-benchmark-model-folder` | +| `LUSTRE_CAPACITY_SERVING_MODEL` | The capacity of your Lustre instance. | `my-benchmark-model-folder` | +| `LUSTRE_PROJECT_SERVING_MODEL` | The project where your Lustre instance resides. | `my-benchmark-model-folder` | +| `LUSTRE_LOCATION_SERVING_MODEL` | The zonal location of your Lustre instance. | `my-benchmark-model-folder` | +| `LUSTRE_IP_ADDRESS_SERVING_MODEL` | The IP address of your Lustre instance, it can be obtained from the mountPoint field. | `my-benchmark-model-folder` | +| `LUSTRE_FILE_SYSTEM_SERVING_MODEL` | The file system name of your Managed Lustre instance. | `my-benchmark-model-folder` | +| `TRTLLM_VERSION` | The tag/version for the Docker image. Other verions can be found at https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release | `1.2.0rc2` | + +To locate your Managed Lustre instance and collect the Lustre instance information, you can run the following command: + +``` +gcloud lustre instances list \ + --project=${LUSTRE_PROJECT_SERVING_MODEL} \ + --location=${LUSTRE_LOCATION_SERVING_MODEL} +``` + +The output should look similar to the following. Before you proceed to the next step, make sure to note down the Managed Lustre instance name, filesystem, and the mountPoint fields. + +``` +capacityGib: '9000' +createTime: '2025-04-28T22:42:11.140825450Z' +filesystem: testlfs +gkeSupportEnabled: true +mountPoint: 10.90.1.4@tcp:/testlfs +name: projects/my-project/locations/us-central1-a/instances/my-lustre +network: projects/my-project/global/networks/default +perUnitStorageThroughput: '1000' +state: ACTIVE +updateTime: '2025-04-28T22:51:41.559098631Z' +``` + + +### 3.3. Connect to your GKE Cluster + +Fetch credentials for `kubectl` to communicate with your cluster. + +```bash +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + + +### 3.4 Upload the Model Checkpoints + +To download the model from HuggingFace, please follow the steps below: + +1. Follow these [instructions](https://docs.cloud.google.com/managed-lustre/docs/connect-from-compute-engine) to create a compute + engine and mount your Lustre instance onto it. +3. Access into the mount point on compute engine and create the model folder. +4. Under the mount point, + [download](https://huggingface.co/docs/hub/en/models-downloading) the model + using the `hf` command: + + ```bash + hf download {MODEL_NAME} --local-dir . + ``` + + +### 3.5 Create Persistent Volumes and Persistent Volume Claims + +The inference deployment accesses Lustre instances for serving model using +Kubernetes Persistent Volumes (PV) and Persistent Volume +Claims (PVC). You must generate PVs and PVCs for serving model lustre +instances using the +[lustre helper Helm chart](../../../../src/helm-charts/storage/lustre). 
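+
+If you want to inspect the PersistentVolume and PersistentVolumeClaim manifests
+before creating them, `helm template` renders the chart locally with the same
+values. This is an optional sanity check, not a required step:
+
+```bash
+# Render the Lustre PV/PVC manifests locally without installing anything.
+helm template -f $REPO_ROOT/src/helm-charts/storage/lustre/values.yaml \
+--set lustreVolumes[0].instance_name=${LUSTRE_INSTANCE_NAME_SERVING_MODEL} \
+--set lustreVolumes[0].project_id=${LUSTRE_PROJECT_SERVING_MODEL} \
+--set lustreVolumes[0].location=${LUSTRE_LOCATION_SERVING_MODEL} \
+--set lustreVolumes[0].ip_address=${LUSTRE_IP_ADDRESS_SERVING_MODEL} \
+--set lustreVolumes[0].capacity=${LUSTRE_CAPACITY_SERVING_MODEL} \
+--set lustreVolumes[0].file_system=${LUSTRE_FILE_SYSTEM_SERVING_MODEL} \
+$USER-lustre-pv-pvc \
+$REPO_ROOT/src/helm-charts/storage/lustre
+```
+
+Then install the chart to create the PersistentVolume and PersistentVolumeClaim: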
+ +```bash +helm install -f $REPO_ROOT/src/helm-charts/storage/lustre/values.yaml \ +--set lustreVolumes[0].instance_name=${LUSTRE_INSTANCE_NAME_SERVING_MODEL} \ +--set lustreVolumes[0].project_id=${LUSTRE_PROJECT_SERVING_MODEL} \ +--set lustreVolumes[0].location=${LUSTRE_LOCATION_SERVING_MODEL} \ +--set lustreVolumes[0].ip_address=${LUSTRE_IP_ADDRESS_SERVING_MODEL} \ +--set lustreVolumes[0].capacity=${LUSTRE_CAPACITY_SERVING_MODEL} \ +--set lustreVolumes[0].file_system=${LUSTRE_FILE_SYSTEM_SERVING_MODEL} \ +$USER-lustre-pv-pvc \ +$REPO_ROOT/src/helm-charts/storage/lustre +``` + + +### 3.6 Enable Managed Lustre CSI Driver on an Existing GKE Cluster + +If the Managed Lustre CSI driver is not enabled on your GKE cluster, please +follow these [instructions](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/lustre-csi-driver-existing-instance#existing-cluster) +to enable it. + + +## 4. Run the recipe + +[Back to Top](#table-of-contents) + +This recipe supports the deployment of the following models: + +1. [DeepSeek-R1-NVFP4-v2](#serving-deepseek-r1) + +> [!NOTE] +> After running the recipe with `helm install`, it can take **up to 30 minutes** for the deployment to become fully available. This is because the GKE node must first pull the Docker image. + + +### 4.1. Inference benchmark for DeepSeek-R1 671B Model + +[Back to Top](#table-of-contents) + +The recipe runs inference throughputs benchmark for [DeepSeek-R1 671B NVFP4 model](https://huggingface.co/nvidia/DeepSeek-R1-NVFP4-v2) which is Nvidia's pre-quantized FP4 checkpoint of the original [DeepSeek-R1 671B model](https://huggingface.co/deepseek-ai/DeepSeek-R1). + +The recipe uses [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-benchmarking.md), a command-line tool from NVIDIA to benchmark the performance of TensorRT-LLM engine. For more information about `trtllm-bench`, see the [TensorRT-LLM documentation](https://github.com/NVIDIA/TensorRT-LLM). + +> [!NOTE] +> The config file directly exposes the settings within TensorRT-LLM's llm_args.py class, which are passed to `trtllm-bench`, you can modify these as needed in [`src/frameworks/a4x/trtllm-configs/deepseek-r1-nvfp4.yaml`](../../../../src/frameworks/a4x/trtllm-configs/deepseek-r1-nvfp4.yaml) + +1. Install the helm chart to prepare and benchmark the model using [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-benchmarking.md) tool: + + ```bash + cd $RECIPE_ROOT + helm install -f values.yaml \ + --set-file workload_launcher=$REPO_ROOT/src/launchers/trtllm-launcher-lustre.sh \ + --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/trtllm-configs/deepseek-r1-nvfp4.yaml \ + --set queue=${KUEUE_NAME} \ + --set "volumes.gcsMounts[0].bucketName=${GCS_BUCKET_LOGS}" \ + --set "volumes.pvcMounts[0].subPath=${LUSTER_FOLDER_SERVING_MODEL}" \ + --set workload.model.name=nvidia/DeepSeek-R1-NVFP4-v2 \ + --set workload.image=nvcr.io/nvidia/tensorrt-llm/release:${TRTLLM_VERSION} \ + $USER-serving-deepseek-r1-model \ + $REPO_ROOT/src/helm-charts/a4x/inference-templates-lustre/deployment + ``` + + This creates a Helm release and a Deployment named `$USER-serving-deepseek-r1-model`, and a Service named `$USER-serving-deepseek-r1-model-svc`. + +2. **Check the deployment status.** + + ```bash + kubectl get deployment/$USER-serving-deepseek-r1-model + ``` + + Wait until the `READY` column shows `1/1`. 
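+
+   Alternatively, to block until the rollout finishes (or fail after a
+   timeout), you can watch the same Deployment with `kubectl rollout status`;
+   the timeout value below is only an example:
+
+   ```bash
+   kubectl rollout status deployment/$USER-serving-deepseek-r1-model --timeout=30m
+   ```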
See the [Monitoring and Troubleshooting](#monitoring) section to view the deployment logs. + + > [!NOTE] + > - This helm chart is configured to run only a single benchmarking experiment for 1k requests for 128 tokens of input/output lengths. To run other experiments, you can add the various combinations provided in the [values.yaml](values.yaml) file. + + + +## 5. Monitoring and Troubleshooting + +[Back to Top](#table-of-contents) + +After the model is deployed via Helm as described in the sections [above](#run-the-recipe), use the following steps to monitor the deployment and interact with the model. Replace `` and `` with the appropriate names from the model-specific deployment instructions (e.g., `$USER-serving-deepseek-r1-model` and `$USER-serving-deepseek-r1-model-svc`). + + + +### 5.1. Check Deployment Status + +Check the status of your deployment. Replace the name if you deployed a different model. + +```bash +# Example for DeepSeek-R1 671B +kubectl get deployment/$USER-serving-deepseek-r1-model +``` + +Wait until the `READY` column shows `1/1`. If it shows `0/1`, the pod is still starting up. + +> [!NOTE] +> In the GKE UI on Cloud Console, you might see a status of "Does not have minimum availability" during startup. This is normal and will resolve once the pod is ready. + + +### 5.2. View Logs + +To see the logs from the TRTLLM server (useful for debugging), use the `-f` flag to follow the log stream: + +```bash +kubectl logs -f deployment/$USER-serving-deepseek-r1-model +``` + +You should see logs indicating preparing the model, and then running the throughput benchmark test, similar to this: + +```bash +Running benchmark for nvidia/DeepSeek-R1-NVFP4-v2 with ISL=128, OSL=128, TP=4, EP=4, PP=1 + +=========================================================== += PYTORCH BACKEND +=========================================================== +Model: nvidia/DeepSeek-R1-NVFP4-v2 +Model Path: /ssd/nvidia/DeepSeek-R1-NVFP4-v2 +TensorRT LLM Version: 1.2 +Dtype: bfloat16 +KV Cache Dtype: FP8 +Quantization: NVFP4 + +=========================================================== += REQUEST DETAILS +=========================================================== +Number of requests: 1000 +Number of concurrent requests: 985.9849 +Average Input Length (tokens): 128.0000 +Average Output Length (tokens): 128.0000 +=========================================================== += WORLD + RUNTIME INFORMATION +=========================================================== +TP Size: 4 +PP Size: 1 +EP Size: 4 +Max Runtime Batch Size: 2304 +Max Runtime Tokens: 4608 +Scheduling Policy: GUARANTEED_NO_EVICT +KV Memory Percentage: 85.00% +Issue Rate (req/sec): 8.3913E+13 + +=========================================================== += PERFORMANCE OVERVIEW +=========================================================== +Request Throughput (req/sec): X.XX +Total Output Throughput (tokens/sec): X.XX +Total Token Throughput (tokens/sec): X.XX +Total Latency (ms): X.XX +Average request latency (ms): X.XX +Per User Output Throughput [w/ ctx] (tps/user): X.XX +Per GPU Output Throughput (tps/gpu): X.XX + +-- Request Latency Breakdown (ms) ----------------------- + +[Latency] P50 : X.XX +[Latency] P90 : X.XX +[Latency] P95 : X.XX +[Latency] P99 : X.XX +[Latency] MINIMUM: X.XX +[Latency] MAXIMUM: X.XX +[Latency] AVERAGE: X.XX + +=========================================================== += DATASET DETAILS +=========================================================== +Dataset Path: 
/ssd/token-norm-dist_DeepSeek-R1-NVFP4-v2_128_128_tp4.json +Number of Sequences: 1000 + +-- Percentiles statistics --------------------------------- + + Input Output Seq. Length +----------------------------------------------------------- +MIN: 128.0000 128.0000 256.0000 +MAX: 128.0000 128.0000 256.0000 +AVG: 128.0000 128.0000 256.0000 +P50: 128.0000 128.0000 256.0000 +P90: 128.0000 128.0000 256.0000 +P95: 128.0000 128.0000 256.0000 +P99: 128.0000 128.0000 256.0000 +=========================================================== +``` + + +## 6. Cleanup + +[Back to Top](#table-of-contents) + +To avoid incurring further charges, clean up the resources you created. + +1. **Uninstall the Helm Release:** + + First, list your releases to get the deployed models: + + ```bash + # list deployed models + helm list --filter $USER-serving- + ``` + + Then, uninstall the desired release: + + ```bash + # uninstall the deployed model + helm uninstall + ``` + Replace `` with the helm release names listed. + +2. **Delete the Persistent Volume and Persistent Volume Claim:** + + ```bash + # uninstall the deployed pv and pvc. + helm uninstall $USER-lustre-pv-pvc + ``` + +3. (Optional) Delete the built Docker image from Artifact Registry if no longer needed. +4. (Optional) Delete Cloud Build logs. +5. (Optional) Clean up files in your GCS bucket if benchmarking was performed. +6. (Optional) Delete the [test environment](#test-environment) provisioned including GKE cluster. diff --git a/inference/a4x/single-host-serving/tensorrt-llm-lustre/values.yaml b/inference/a4x/single-host-serving/tensorrt-llm-lustre/values.yaml new file mode 100644 index 0000000..ea3fb42 --- /dev/null +++ b/inference/a4x/single-host-serving/tensorrt-llm-lustre/values.yaml @@ -0,0 +1,63 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
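+
+# Default values for the A4X TRT-LLM serving deployment with Lustre storage.
+# Empty fields (queue, the logs bucket name, the PVC subPath, the model name,
+# and the container image) are populated at install time via the --set and
+# --set-file flags shown in the README.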
+ +queue: + +dwsSettings: + maxRunDurationSeconds: + +volumes: + gcsVolumes: true + ssdMountPath: "/ssd" + gcsMounts: + - bucketName: + mountPath: "/gcs-logs" + pvcMounts: + - claimName: "lustre-serving-model-pvc" + mountPath: "/serving-model" + subPath: + +service: + type: ClusterIP + ports: + http: 8000 + +workload: + model: + name: + gpus: 4 + image: + framework: trtllm + configFile: serving-args.yaml + configPath: /workload/configs + envs: + - name: LAUNCHER_SCRIPT + value: "/workload/launcher/launch-workload.sh" + - name: SERVER_ARGS_FILE + value: "/workload/configs/serving-args.yaml" + benchmarks: + experiments: + - isl: 128 + osl: 128 + num_requests: 1000 + +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.7 + ncclSettings: + - name: NCCL_DEBUG + value: "VERSION" + +targetNodes: + - gke-a4x-baker-a4x-highgpu-4g-a4x-pool-42293301-2hvc + diff --git a/src/helm-charts/a4x/inference-templates-lustre/deployment/Chart.yaml b/src/helm-charts/a4x/inference-templates-lustre/deployment/Chart.yaml new file mode 100644 index 0000000..4f584cc --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-lustre/deployment/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: single-host-serving-deployment-template +description: single-host-serving-deployment-template +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-config-configmap.yaml b/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-config-configmap.yaml new file mode 100644 index 0000000..a17bdf4 --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-config-configmap.yaml @@ -0,0 +1,25 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
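+
+# ConfigMap carrying the TRT-LLM serving configuration that the README passes
+# to the chart with --set-file serving_config=...; the deployment mounts it
+# under /workload/configs for the launcher script.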
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + serving-configuration: |- +{{- if .Values.serving_config }} +{{ .Values.serving_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} \ No newline at end of file diff --git a/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-launcher-configmap.yaml b/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-launcher-configmap.yaml new file mode 100644 index 0000000..b111553 --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-launcher-configmap.yaml @@ -0,0 +1,27 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} \ No newline at end of file diff --git a/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-launcher.yaml b/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-launcher.yaml new file mode 100644 index 0000000..e2d6e4f --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-launcher.yaml @@ -0,0 +1,268 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{ $nodes := div .Values.workload.gpus 4 | max 1 }} +{{ $gpusPerNode := min .Values.workload.gpus 4 }} + +{{ $root := . 
}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + app: {{ .Release.Name }}-serving + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + replicas: {{ $nodes }} + selector: + matchLabels: + app: {{ .Release.Name }}-serving + template: + metadata: + labels: + app: {{ .Release.Name }}-serving + annotations: + kubectl.kubernetes.io/default-container: serving + {{- if or $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Always + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + {{ range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{ end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + - key: "kubernetes.io/arch" + operator: "Equal" + value: "arm64" + effect: "NoSchedule" + volumes: + {{- if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{- end }} + - name: serving-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: serving-configuration + path: {{ $root.Values.workload.configFile | default "serving-args" }} + - name: serving-launcher + configMap: + name: "{{.Release.Name}}-launcher" + defaultMode: 0700 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ $gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end }} + {{ if (gt (len $root.Values.volumes.pvcMounts) 0) }} + - name: "{{ (index $root.Values.volumes.pvcMounts 0).claimName }}" + persistentVolumeClaim: + claimName: "{{ (index $root.Values.volumes.pvcMounts 0).claimName }}" + {{- end }} + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{- if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. 
/target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{- end }} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + - name: serving + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + {{- if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 12 }} + {{- end }} + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + - name: LD_LIBRARY_PATH + value: /usr/local/gib/lib64:/usr/local/nvidia/lib64 + {{- if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{- end }} + # Workload specific environment variables + - name: MODEL_NAME + value: "{{ $root.Values.workload.model.name }}" + - name: TRTLLM_DIR + value: "/app/tensorrt_llm" + {{- if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 12 }} + {{- end }} + + workingDir: /workload + command: ["/bin/bash", "-c"] + args: + - | + #!/bin/bash + + if [ ! -f "$LAUNCHER_SCRIPT" ]; then + echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" + exit 1 + fi + + ARGS=() + EXTRA_ARGS_FILE="/tmp/extra_llm_api_args.yaml" + + # Use Python to parse the main config file, extract llm_api_args, + # and generate the command-line arguments. + python -c " + import yaml + import sys + + args = [] + llm_api_args = {} + config_file = sys.argv[1] + extra_args_file = sys.argv[2] + + try: + with open(config_file, 'r') as f: + config = yaml.safe_load(f) + + if 'llm_api_args' in config: + llm_api_args = config.pop('llm_api_args') + with open(extra_args_file, 'w') as f: + yaml.dump(llm_api_args, f) + + for key, value in config.items(): + if value is True: + args.append(f'--{key}') + elif value is not False: + args.append(f'--{key}') + args.append(str(value)) + + # Print the arguments for the shell script to capture + print(' '.join(args)) + + except Exception as e: + print(f'Error parsing config file: {e}', file=sys.stderr) + sys.exit(1) + " "$SERVER_ARGS_FILE" "$EXTRA_ARGS_FILE" > /tmp/launcher_args.txt + + # Read the generated arguments into the ARGS array + mapfile -t ARGS < <(tr ' ' '\n' < /tmp/launcher_args.txt) + rm /tmp/launcher_args.txt + + {{ if eq $root.Values.workload.framework "trtllm" }} + {{- range $root.Values.workload.benchmarks.experiments }} + echo "Running: $LAUNCHER_SCRIPT --model_name $MODEL_NAME --isl {{ .isl }} --osl {{ .osl }} --num_requests {{ .num_requests }} -- ${ARGS[@]}" + exec "$LAUNCHER_SCRIPT" --model_name $MODEL_NAME --isl {{ .isl }} --osl {{ .osl }} --num_requests {{ .num_requests }} -- "${ARGS[@]}" + {{- end }} + {{ else }} + echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" + exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" + {{- end }} + + volumeMounts: + {{- if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{- end }} + - name: serving-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + - name: serving-launcher + mountPath: /workload/launcher + - name: shared-memory + mountPath: /dev/shm + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + {{ if (gt (len 
$root.Values.volumes.pvcMounts) 0) }} + - name: "{{ (index $root.Values.volumes.pvcMounts 0).claimName }}" + mountPath: "{{ (index $root.Values.volumes.pvcMounts 0).mountPath }}" + subPath: "{{ (index $root.Values.volumes.pvcMounts 0).subPath }}" + {{- end }} + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + requests: + nvidia.com/gpu: {{ $gpusPerNode }} + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-svc.yaml b/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-svc.yaml new file mode 100644 index 0000000..3d1363b --- /dev/null +++ b/src/helm-charts/a4x/inference-templates-lustre/deployment/templates/serving-svc.yaml @@ -0,0 +1,26 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + name: {{ .Release.Name }}-svc +spec: + selector: + app: {{ .Release.Name }}-serving + ports: + - name: http + port: {{ .Values.service.ports.http }} + targetPort: {{ .Values.service.ports.http }} + type: {{ .Values.service.type }} \ No newline at end of file diff --git a/src/helm-charts/storage/lustre/Chart.yaml b/src/helm-charts/storage/lustre/Chart.yaml new file mode 100644 index 0000000..de86b15 --- /dev/null +++ b/src/helm-charts/storage/lustre/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: lustre-pv-pvc +description: "Lustre Volumes and Persistent Volume Claims" +type: application +version: 0.1.0 +appVersion: "0.1.0" diff --git a/src/helm-charts/storage/lustre/templates/pv.yaml b/src/helm-charts/storage/lustre/templates/pv.yaml new file mode 100644 index 0000000..800c8f9 --- /dev/null +++ b/src/helm-charts/storage/lustre/templates/pv.yaml @@ -0,0 +1,39 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- range $lv := .Values.lustreVolumes }} +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: "{{ $lv.name }}-pv" +spec: + storageClassName: "" + capacity: + storage: {{ $lv.capacity }} + accessModes: + - ReadWriteMany + persistentVolumeReclaimPolicy: Retain + volumeMode: Filesystem + claimRef: + namespace: default + name: "{{ $lv.name }}-pvc" + csi: + driver: lustre.csi.storage.gke.io + volumeHandle: {{ $lv.project_id }}/{{ $lv.location }}/{{ $lv.instance_name }} + volumeAttributes: + ip: {{ $lv.ip_address }} + filesystem: {{ $lv.file_system }} +--- +{{- end }} diff --git a/src/helm-charts/storage/lustre/templates/pvc.yaml b/src/helm-charts/storage/lustre/templates/pvc.yaml new file mode 100644 index 0000000..8d81ddc --- /dev/null +++ b/src/helm-charts/storage/lustre/templates/pvc.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- range $lv := .Values.lustreVolumes }} +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: {{ $lv.name }}-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: "" + volumeName: {{ $lv.name }}-pv + resources: + requests: + storage: {{ $lv.capacity }} +--- +{{- end }} diff --git a/src/helm-charts/storage/lustre/values.yaml b/src/helm-charts/storage/lustre/values.yaml new file mode 100644 index 0000000..89bcf73 --- /dev/null +++ b/src/helm-charts/storage/lustre/values.yaml @@ -0,0 +1,26 @@ +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +lustreVolumes: + - name: lustre-serving-model + instance_name: + project_id: + location: + ip_address: + capacity: + file_system: + diff --git a/src/launchers/trtllm-launcher-lustre.sh b/src/launchers/trtllm-launcher-lustre.sh new file mode 100644 index 0000000..4558da1 --- /dev/null +++ b/src/launchers/trtllm-launcher-lustre.sh @@ -0,0 +1,211 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash + +# Exit immediately if a command exits with a non-zero status. +set -eux + +echo "TensorRT-LLM benchmark arguments received:" +echo " $@" +echo "" + +# Function to validate model name +validate_model_name() { + if [ -z "$MODEL_NAME" ]; then + echo "Error: MODEL_NAME environment variable is not set." + exit 1 + fi + echo "Using MODEL_NAME: $MODEL_NAME" +} + +# Function to parse arguments +parse_arguments() { + model_name=$MODEL_NAME + isl=128 + osl=128 + num_requests=30000 + + # Parse known arguments and check for unknown option or missing argument + PARSED_OPTIONS=$(getopt -o "" -l model_name:,isl:,osl:,num_requests: -- "$@") + if [ $? -ne 0 ]; then + echo "Error: Failed to parse arguments. Check for invalid options or missing values." + exit 1 + fi + + # set the shell's positional parameters + eval set -- "$PARSED_OPTIONS" + + while true; do + case "$1" in + --model_name) + model_name="$2" + shift 2 + ;; + --isl) + isl="$2" + shift 2 + ;; + --osl) + osl="$2" + shift 2 + ;; + --num_requests) + num_requests="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "Internal error: Argument parsing issue. Unexpected option: $1" + exit 1 + ;; + esac + done + + SERVING_CONFIG=("$@") +} + +# Function to parse serving config +parse_serving_config() { + declare -g -A SERVING_CONFIG_DICT + + for ((index = 0; index < ${#SERVING_CONFIG[@]}; )); do + current_arg="${SERVING_CONFIG[$index]}" + next_arg="${SERVING_CONFIG[$((index + 1))]}" + + # Handle --key=value format + if [[ "$current_arg" =~ ^--[^=]+=.+ ]]; then + key=$(echo "$current_arg" | cut -d'=' -f1 | sed 's/--//') + value=$(echo "$current_arg" | cut -d'=' -f2-) + SERVING_CONFIG_DICT["$key"]="$value" + ((index++)) + # Handle --key value format + elif [[ "$current_arg" =~ ^--[^=]+$ && -n "$next_arg" && ! "$next_arg" =~ ^-- ]]; then + # Check if: + # 1. Current arg starts with -- and has no '=' (e.g., --key) + # 2. There IS a next argument (`-n "$next_arg"`) + # 3. The next argument does NOT start with -- (meaning it's a value, not another option) + key=$(echo "$current_arg" | sed 's/--//') + value="$next_arg" + SERVING_CONFIG_DICT["$key"]="$value" + ((index += 2)) + # Handle --flag (boolean flag without a value) + elif [[ "$current_arg" =~ ^--[^=]+$ ]]; then + # If the key was pre-defined with a default, this will overwrite it to 'true'. + # If not pre-defined, it will create it. 
+ key=$(echo "$current_arg" | sed 's/--//') + SERVING_CONFIG_DICT["$key"]="true" + ((index++)) + else + ((index++)) + fi + done + + tp_size=${SERVING_CONFIG_DICT["tp_size"]:=8} + pp_size=${SERVING_CONFIG_DICT["pp_size"]:=1} + ep_size=${SERVING_CONFIG_DICT["ep_size"]:=1} + backend=${SERVING_CONFIG_DICT["backend"]:="tensorrt"} + kv_cache_free_gpu_mem_fraction=${SERVING_CONFIG_DICT["kv_cache_free_gpu_mem_fraction"]:=0.95} +} + +print_configuration() { + echo "TensorRT-LLM benchmark arguments received:" + echo " $@" + echo "" + echo "--------------------------------" + echo "--- Parsed Arguments Summary ---" + echo "model name: $model_name" + echo "input seq length: $isl" + echo "output seq length: $osl" + echo "number of requests: $num_requests" + echo "tensor parallel size: $tp_size" + echo "pipeline parallel size: $pp_size" + echo "expert parallel size: $ep_size" + echo "backend: $backend" + echo "kv_cache_free_gpu_mem_fraction: $kv_cache_free_gpu_mem_fraction" + echo "--------------------------------" +} + +# Function to run benchmarks +run_benchmark() { + local model_name=$1 + local isl=$2 + local osl=$3 + local num_requests=$4 + local tp_size=$5 + local pp_size=$6 + local ep_size=$7 + local backend=$8 + local kv_cache_free_gpu_mem_fraction=$9 + + echo "Running benchmark for $model_name with ISL=$isl, OSL=$osl, TP=$tp_size, PP=$pp_size, EP=$ep_size, backend=$7" + + dataset_file="/ssd/token-norm-dist_${model_name##*/}_${isl}_${osl}_tp${tp_size}.json" + output_file="/ssd/output_${model_name##*/}_isl${isl}_osl${osl}_tp${tp_size}.txt" + extra_args_file="/tmp/extra_llm_api_args.yaml" + extra_args="" + if [ -f "$extra_args_file" ]; then + extra_args="--extra_llm_api_options $extra_args_file" + fi + + exec > >(tee $output_file) 2>&1 + + echo "Preparing dataset" + python3 $TRTLLM_DIR/benchmarks/cpp/prepare_dataset.py \ + --tokenizer /serving-model \ + --stdout token-norm-dist \ + --num-requests=$num_requests \ + --input-mean=$isl \ + --output-mean=$osl \ + --input-stdev=0 \ + --output-stdev=0 >$dataset_file + + echo "Running throughput benchmark" + trtllm-bench \ + --model $model_name \ + --model_path /serving-model throughput \ + --dataset $dataset_file \ + --tp $tp_size \ + --pp $pp_size \ + --ep $ep_size \ + --backend "pytorch" \ + --kv_cache_free_gpu_mem_fraction $kv_cache_free_gpu_mem_fraction $extra_args + + cp $output_file /gcs-logs/benchmark_logs/trtllm/ + + rm -f $dataset_file +} + +# Main function to run the benchmark +main() { + # parse arguments + validate_model_name + parse_arguments "$@" + parse_serving_config + print_configuration "$@" + + # run benchmark + mkdir -p /gcs-logs/benchmark_logs/trtllm + echo "Running benchmarks" + run_benchmark "$model_name" $isl $osl $num_requests $tp_size $pp_size $ep_size $backend $kv_cache_free_gpu_mem_fraction +} + +# Set environment variables +export LD_LIBRARY_PATH=/usr/local/lib/python3.12/dist-packages/torch/lib:/usr/local/lib/python3.12/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/tensorrt/lib + +# Run the main function +main "$@" From a923d09729e090ef7e8ca5c09fae48530d91d466 Mon Sep 17 00:00:00 2001 From: lepan-google Date: Wed, 10 Dec 2025 09:30:38 +0000 Subject: [PATCH 10/10] Fix README --- .../tensorrt-llm-lustre/README.md | 54 ++++++------------- .../tensorrt-llm-lustre/values.yaml | 4 -- 2 files changed, 17 insertions(+), 41 deletions(-) diff --git a/inference/a4x/single-host-serving/tensorrt-llm-lustre/README.md 
b/inference/a4x/single-host-serving/tensorrt-llm-lustre/README.md index 4c63b68..eb008c3 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-lustre/README.md +++ b/inference/a4x/single-host-serving/tensorrt-llm-lustre/README.md @@ -17,7 +17,6 @@ This guide walks you through setting up the necessary cloud infrastructure, conf - [3.3. Connect to your GKE Cluster](#33-connect-to-your-gke-cluster) - [3.4 Upload the Model Checkpoints](#34-upload-the-model-checkpoints) - [3.5 Create Persistent Volumes and Persistent Volume Claims](#35-create-persistent-volumes-and-persistent-volume-claims) - - [3.6 Enable Managed Lustre CSI Driver on an Existing GKE Cluster](#36-enable-managed-lustre-csi-driver-on-an-existing-gke-cluster) - [4. Run the recipe](#4-run-the-recipe) - [4.1. Inference benchmark for DeepSeek-R1 671B Model](#41-inference-benchmark-for-deepseek-r1-671b-model) - [5. Monitoring and Troubleshooting](#5-monitoring-and-troubleshooting) @@ -43,6 +42,7 @@ This recipe has been optimized for and tested with the following configuration: * A GPU node pool with 1 [a4x-highgpu-4g](https://cloud.google.com/compute/docs/gpus) machine. * [Workload Identity Federation for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) enabled. * [Cloud Storage FUSE CSI driver for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/cloud-storage-fuse-csi-driver) enabled. + * [Managed Lustre CSI driver for GKE](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/lustre-csi-driver-existing-instance#existing-cluster) enabled. * [DCGM metrics](https://cloud.google.com/kubernetes-engine/docs/how-to/dcgm-metrics) enabled. * [Kueue](https://kueue.sigs.k8s.io/docs/reference/kueue.v1beta1/) and [JobSet](https://jobset.sigs.k8s.io/docs/overview/) APIs installed. * Kueue configured to support [Topology Aware Scheduling](https://kueue.sigs.k8s.io/docs/concepts/topology_aware_scheduling/). @@ -78,10 +78,10 @@ flowchart TD E["TensorRT-LLM container"] F["Service"] end - subgraph storage["Managed Lustre"] + subgraph lustre["Managed Lustre"] G["Model Lustre Instance"] end - subgraph storage["Cloud Storage"] + subgraph gcs["Cloud Storage"] H["Logs Bucket"] end @@ -136,27 +136,14 @@ export CLUSTER_NAME= export KUEUE_NAME= export GCS_BUCKET_LOGS= export LUSTRE_INSTANCE_NAME_SERVING_MODEL= -export LUSTER_FOLDER_SERVING_MODEL= +export LUSTRE_FOLDER_SERVING_MODEL= export LUSTRE_CAPACITY_SERVING_MODEL= -export LUSTRE_PROJECT_SERVING_MODEL= +export LUSTRE_PROJECT_ID_SERVING_MODEL= export LUSTRE_LOCATION_SERVING_MODEL= export LUSTRE_IP_ADDRESS_SERVING_MODEL= export LUSTRE_FILE_SYSTEM_SERVING_MODEL= export TRTLLM_VERSION=1.2.0rc2 -export PROJECT_ID=supercomputer-testing -export CLUSTER_REGION=us-central1 -export CLUSTER_NAME=a4x-baker -export GCS_BUCKET_LOGS=tess-benchmark-outputs -export LUSTRE_INSTANCE_NAME_SERVING_MODEL=a4x-baker -export LUSTER_FOLDER_SERVING_MODEL=DeepSeek-R1-NVFP4-v2 -export LUSTRE_CAPACITY_SERVING_MODEL=126000Gi -export LUSTRE_PROJECT_SERVING_MODEL=supercomputer-testing -export LUSTRE_LOCATION_SERVING_MODEL=us-central1-b -export LUSTRE_IP_ADDRESS_SERVING_MODEL=172.21.47.3 -export LUSTRE_FILE_SYSTEM_SERVING_MODEL=lustrefs -export TRTLLM_VERSION=1.2.0rc2 - # Set the project for gcloud commands gcloud config set project $PROJECT_ID ``` @@ -171,23 +158,23 @@ Replace the following values: | `KUEUE_NAME` | The name of the Kueue local queue. The default queue created by the cluster toolkit is `a4x`. Verify the name in your cluster. 
| `a4x` | | `GCS_BUCKET_LOGS` | Name of your GCS logs bucket (do not include `gs://`). | `my-benchmark-logs-bucket` | | `LUSTRE_INSTANCE_NAME_SERVING_MODEL` | The name of your Lustre instance. | `my-benchmark-model-lustre` | -| `LUSTER_FOLDER_SERVING_MODEL` | The path to the GCS model folder on the Lustre. | `my-benchmark-model-folder` | -| `LUSTRE_CAPACITY_SERVING_MODEL` | The capacity of your Lustre instance. | `my-benchmark-model-folder` | -| `LUSTRE_PROJECT_SERVING_MODEL` | The project where your Lustre instance resides. | `my-benchmark-model-folder` | -| `LUSTRE_LOCATION_SERVING_MODEL` | The zonal location of your Lustre instance. | `my-benchmark-model-folder` | -| `LUSTRE_IP_ADDRESS_SERVING_MODEL` | The IP address of your Lustre instance, it can be obtained from the mountPoint field. | `my-benchmark-model-folder` | -| `LUSTRE_FILE_SYSTEM_SERVING_MODEL` | The file system name of your Managed Lustre instance. | `my-benchmark-model-folder` | +| `LUSTRE_FOLDER_SERVING_MODEL` | The path to the GCS model folder on the Lustre. | `my-benchmark-model-folder` | +| `LUSTRE_CAPACITY_SERVING_MODEL` | The capacity of your Lustre instance. | `126000Gi` | +| `LUSTRE_PROJECT_ID_SERVING_MODEL` | The project where your Lustre instance is located. | `gcp-project-12345` | +| `LUSTRE_LOCATION_SERVING_MODEL` | The zonal location of your Lustre instance. | `us-central1-b` | +| `LUSTRE_IP_ADDRESS_SERVING_MODEL` | The IP address of your Lustre instance: it can be obtained from the mountPoint field. | `172.21.47.3` | +| `LUSTRE_FILE_SYSTEM_SERVING_MODEL` | The file system of your Lustre instance. | `lustrefs` | | `TRTLLM_VERSION` | The tag/version for the Docker image. Other verions can be found at https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release | `1.2.0rc2` | To locate your Managed Lustre instance and collect the Lustre instance information, you can run the following command: ``` gcloud lustre instances list \ - --project=${LUSTRE_PROJECT_SERVING_MODEL} \ + --project=${LUSTRE_PROJECT_ID_SERVING_MODEL} \ --location=${LUSTRE_LOCATION_SERVING_MODEL} ``` -The output should look similar to the following. Before you proceed to the next step, make sure to note down the Managed Lustre instance name, filesystem, and the mountPoint fields. +The output should look similar to the following. ``` capacityGib: '9000' @@ -217,8 +204,8 @@ gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION To download the model from HuggingFace, please follow the steps below: 1. Follow these [instructions](https://docs.cloud.google.com/managed-lustre/docs/connect-from-compute-engine) to create a compute - engine and mount your Lustre instance onto it. -3. Access into the mount point on compute engine and create the model folder. + engine and mount your Lustre instance on it. +3. Access the mount point on the compute engine and create the model folder. 4. 
Under the mount point, [download](https://huggingface.co/docs/hub/en/models-downloading) the model using the `hf` command: @@ -239,7 +226,7 @@ instances using the ```bash helm install -f $REPO_ROOT/src/helm-charts/storage/lustre/values.yaml \ --set lustreVolumes[0].instance_name=${LUSTRE_INSTANCE_NAME_SERVING_MODEL} \ ---set lustreVolumes[0].project_id=${LUSTRE_PROJECT_SERVING_MODEL} \ +--set lustreVolumes[0].project_id=${LUSTRE_PROJECT_ID_SERVING_MODEL} \ --set lustreVolumes[0].location=${LUSTRE_LOCATION_SERVING_MODEL} \ --set lustreVolumes[0].ip_address=${LUSTRE_IP_ADDRESS_SERVING_MODEL} \ --set lustreVolumes[0].capacity=${LUSTRE_CAPACITY_SERVING_MODEL} \ @@ -248,13 +235,6 @@ $USER-lustre-pv-pvc \ $REPO_ROOT/src/helm-charts/storage/lustre ``` - -### 3.6 Enable Managed Lustre CSI Driver on an Existing GKE Cluster - -If the Managed Lustre CSI driver is not enabled on your GKE cluster, please -follow these [instructions](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/lustre-csi-driver-existing-instance#existing-cluster) -to enable it. - ## 4. Run the recipe @@ -288,7 +268,7 @@ The recipe uses [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/mai --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/trtllm-configs/deepseek-r1-nvfp4.yaml \ --set queue=${KUEUE_NAME} \ --set "volumes.gcsMounts[0].bucketName=${GCS_BUCKET_LOGS}" \ - --set "volumes.pvcMounts[0].subPath=${LUSTER_FOLDER_SERVING_MODEL}" \ + --set "volumes.pvcMounts[0].subPath=${LUSTRE_FOLDER_SERVING_MODEL}" \ --set workload.model.name=nvidia/DeepSeek-R1-NVFP4-v2 \ --set workload.image=nvcr.io/nvidia/tensorrt-llm/release:${TRTLLM_VERSION} \ $USER-serving-deepseek-r1-model \ diff --git a/inference/a4x/single-host-serving/tensorrt-llm-lustre/values.yaml b/inference/a4x/single-host-serving/tensorrt-llm-lustre/values.yaml index ea3fb42..ed92413 100644 --- a/inference/a4x/single-host-serving/tensorrt-llm-lustre/values.yaml +++ b/inference/a4x/single-host-serving/tensorrt-llm-lustre/values.yaml @@ -57,7 +57,3 @@ network: ncclSettings: - name: NCCL_DEBUG value: "VERSION" - -targetNodes: - - gke-a4x-baker-a4x-highgpu-4g-a4x-pool-42293301-2hvc -
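
A quick way to follow the run and collect results once the Helm install in section 4.1 is in place, shown as a minimal sketch: it assumes the logs bucket is mounted at `/gcs-logs` at its root (the path the launcher script writes to) and the release name `$USER-serving-deepseek-r1-model` used above; adjust both if your values differ.

```bash
# Stream the benchmark output from the serving container while the run is in progress.
kubectl logs -f deployment/$USER-serving-deepseek-r1-model -c serving

# The launcher copies each result file to /gcs-logs/benchmark_logs/trtllm/, which the
# GCSFuse mount backs with the logs bucket, so finished runs can be listed and
# downloaded directly from Cloud Storage.
gcloud storage ls "gs://${GCS_BUCKET_LOGS}/benchmark_logs/trtllm/"
gcloud storage cp "gs://${GCS_BUCKET_LOGS}/benchmark_logs/trtllm/output_*.txt" .
```

If the pod stays in `Pending`, `kubectl describe pod -l app=$USER-serving-deepseek-r1-model-serving` surfaces the scheduling reason (for example, no `a4x-highgpu-4g` node available behind the Kueue queue).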