
Commit eeadb34

Authored and committed by Copybara
Copybara import of gpu-recipes:
- d5a7868d51785b830fa853506437141b120e57b5 Multi node inference of DeepSeek R1 on A3Mega (16xH100) GitOrigin-RevId: d5a7868d51785b830fa853506437141b120e57b5
1 parent 0c86ae4 commit eeadb34

File tree: 10 files changed, +1211 -8 lines changed

README.md

Lines changed: 6 additions & 1 deletion
@@ -32,11 +32,16 @@ Welcome to the reproducible benchmark recipes repository for GPUs! This reposito
 | ---------------- | ---------------- | --------- | ------------------- | ------------ | ------------------ |
 | **Llama-3.1-70B** | [A3 Ultra (NVIDIA H200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-ultra-vms) | MaxText | Pre-training | GKE | [Link](./training/a3ultra/llama-3.1-70b/maxtext-pretraining-gke/README.md)
 | **Llama-3.1-70B** | [A3 Ultra (NVIDIA H200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-ultra-vms) | NeMo | Pre-training | GKE | [Link](./training/a3ultra/llama-3.1-70b/nemo-pretraining-gke/README.md)
-| **Llama-3.1-405B** | [A3 Ultra (NVIDIA H200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-ultra-vms) | NeMo | Pre-training | GKE | [Link](./training/a3ultra/llama-3.1-405b/nemo-pretraining-gke/README.md)
 | **Mixtral-8-7B** | [A3 Ultra (NVIDIA H200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-ultra-vms) | MaxText | Pre-training | GKE | [Link](./training/a3ultra/mixtral-8x7b/maxtext-pretraining-gke/README.md)
 | **Mixtral-8-7B** | [A3 Ultra (NVIDIA H200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-ultra-vms) | NeMo | Pre-training | GKE | [Link](./training/a3ultra/mixtral-8x7b/nemo-pretraining-gke/README.md) |
 
 
+### Inference benchmarks A3 Mega
+
+| Models | GPU Machine Type | Framework | Workload Type | Orchestrator | Link to the recipe |
+| ---------------- | ---------------- | --------- | ------------------- | ------------ | ------------------ |
+| **DeepSeek R1 671B** | [A3 Mega (NVIDIA H100)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-mega-vms) | SGLang | Inference | GKE | [Link](./inference/a3mega/deepseek-r1-671b/sglang-serving-gke/README.md)
+
 ### Inference benchmarks A3 Ultra
 
 | Models | GPU Machine Type | Framework | Workload Type | Orchestrator | Link to the recipe |

docs/configuring-environment-gke-a3-ultra.md

Lines changed: 5 additions & 4 deletions
@@ -73,13 +73,14 @@ Replace the following:
 
 Add IAM binding to allow workloads authenticated via a workload identity (with the default service account) to access Cloud Storage objects.
 
-```bash
+```bash
 PROJECT_NUMBER=$(gcloud projects describe $PROJECT_ID --format="value(projectNumber)")
 gcloud storage buckets add-iam-policy-binding gs://<BUCKET_NAME> \
---role=roles/storage.objectUser \
---member=principal://iam.googleapis.com/projects/$PROJECT_NUMBER/locations/global/workloadIdentityPools/$PROJECT_ID.svc.id.goog/subject/ns/default/sa/default \
---condition=None
+  --role=roles/storage.objectUser \
+  --member=principal://iam.googleapis.com/projects/$PROJECT_NUMBER/locations/global/workloadIdentityPools/$PROJECT_ID.svc.id.goog/subject/ns/default/sa/default \
+  --condition=None
 ```
+
 Replace the following:
 
 - `BUCKET_NAME`: the name of your bucket created in the previous step
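After the binding is added, it can be sanity-checked; for example (this check is not part of the original doc and reuses the same `PROJECT_ID` and `<BUCKET_NAME>` placeholders as above):

```bash
# Show the bucket's IAM bindings and confirm the Workload Identity principal
# is listed under roles/storage.objectUser.
gcloud storage buckets get-iam-policy gs://<BUCKET_NAME> --format="json(bindings)"
```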

inference/a3mega/deepseek-r1-671b/sglang-serving-gke/README.md

Lines changed: 347 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
#!/bin/bash

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


[ $# -eq 0 ] && {
  echo "Error: No prompt provided."
  echo "Usage: $0 \"Your prompt here\""
  exit 1
}

start_time=$(date +%s.%N)
temp_file="/tmp/temp_response.txt"

# Format the JSON payload to send to the model, with streaming enabled
json_payload=$(jq -n \
  --arg prompt "$1" \
  '{
    model: "default",
    messages: [
      {role: "system", content: "You are a helpful AI assistant"},
      {role: "user", content: $prompt}
    ],
    temperature: 0.6,
    top_p: 0.95,
    max_tokens: 2048,
    stream: true
  }')

echo "Streaming response:"
echo "----------------"

# Send the request to the model and stream the response
curl -sN "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d "$json_payload" | while IFS= read -r line; do
    [[ -z $line ]] && continue

    line=${line#data: }
    [[ $line == "[DONE]" ]] && continue

    content=$(jq -r '.choices[0].delta.content // empty' <<< "$line")
    [[ -n $content ]] && {
      echo -n "$content"
      echo -n "$content" >> "$temp_file"
    }
done

echo -e "\n\n----------------"

[[ ! -s $temp_file ]] && {
  echo "Error: No response received from the API or an error occurred during streaming." >&2
  rm -f "$temp_file"
  exit 1
}

# Parse the response and extract the reasoning and final answer
full_content=$(<"$temp_file")

[[ $full_content =~ \<think\>([[:print:][:space:]]*)\</think\> ]] && \
  reasoning="${BASH_REMATCH[1]}" || reasoning=""

final_answer=$(sed 's/.*<\/think>//; s/^[[:space:]]*//; s/[[:space:]]*$//' <<< "$full_content")

execution_time=$(bc <<< "$(date +%s.%N) - $start_time")

echo -e "\nParsed Results:"
echo "----------------"
echo -e "Reasoning:\n$reasoning"
echo -e "\nFinal Answer:\n$final_answer"
echo -e "\nExecution time: $execution_time seconds"

rm "$temp_file"
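The script sends its request to `http://localhost:30000/v1/chat/completions`, so the SGLang endpoint must be reachable locally. One plausible way to exercise it from a workstation, assuming the ClusterIP Service rendered by the template later in this commit; the release name and the script file name below are illustrative placeholders, not values from this diff:

```bash
# Forward the serving Service to the local machine, then run the streaming client.
# "deepseek-r1" is a hypothetical Helm release name; use the script's committed file name.
kubectl port-forward svc/deepseek-r1-svc 30000:30000 &
./chat-completions.sh "Summarize the benefits of multi-node inference on A3 Mega."
```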
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

targetPlatform: "gke"

clusterName:
queue:

huggingface:
  secretName: hf-secret
  secretData:
    token: "hf_api_token"

model:
  name: deepseek-ai/DeepSeek-R1
  tp_size: 16  # tensor parallelism across 16 H100 GPUs (2 A3 Mega nodes x 8 GPUs)
  pp_size: 1

job:
  image:
    repository:
    tag:
  gpus: 16

volumes:
  ssdMountPath: "/ssd"
  gcsMounts:
    - bucketName:
      mountPath: "/gcs"

gpuPlatformSettings:
  useHostPlugin: false
  ncclPluginImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1"
  rxdmImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14"
  ncclBuildType: 223

network:
  ncclSettings:
    - name: NCCL_DEBUG
      value: "INFO"
  subnetworks[]:

sglang:
  replicaCount: 1

  service:
    type: ClusterIP
    ports:
      http: 30000
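These values are consumed by the Helm chart added later in this commit. As a rough sketch of how the empty fields might be filled in at install time (the release name, cluster, queue, image, and bucket values are placeholders, and `<RECIPE_DIR>` stands for the chart directory; the authoritative steps live in the recipe README, which is not rendered in this view):

```bash
# Install the chart with the per-environment values that are left blank in values.yaml.
helm install deepseek-r1 <RECIPE_DIR> \
  --set clusterName=my-gke-cluster \
  --set queue=my-queue \
  --set job.image.repository=us-docker.pkg.dev/my-project/sglang \
  --set job.image.tag=latest \
  --set volumes.gcsMounts[0].bucketName=my-benchmark-bucket
```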

src/docker/sglang/sglang.Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ RUN apt update && apt install --yes --no-install-recommends \
2323
curl \
2424
gnupg \
2525
cmake \
26+
dnsutils \
2627
&& echo "deb https://packages.cloud.google.com/apt gcsfuse-buster main" \
2728
| tee /etc/apt/sources.list.d/gcsfuse.list \
2829
&& echo "deb https://packages.cloud.google.com/apt cloud-sdk main" \
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: v2
name: sglang-deepseek-r1-671b-inference
description: sglang-deepseek-r1-671b-inference
type: application
version: 0.1.0
appVersion: "1.16.0"
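The chart can be validated locally before installing; a quick, non-authoritative check using the same `<RECIPE_DIR>` placeholder as above:

```bash
# Static checks and a dry rendering of the templates.
helm lint <RECIPE_DIR>
helm template test <RECIPE_DIR> | head -n 40
```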
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: v1
kind: Service
metadata:
  name: {{ .Release.Name }}-svc
spec:
  ports:
  - name: http
    port: {{ .Values.sglang.service.ports.http }}
    targetPort: {{ .Values.sglang.service.ports.http }}
  # Route traffic to the LeaderWorkerSet leader pod, which serves the HTTP API.
  selector:
    leaderworkerset.sigs.k8s.io/name: {{ .Release.Name }}
    role: leader
  type: {{ .Values.sglang.service.type }}
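Once the chart is installed, a quick way to confirm the Service has selected the leader pod (the release name below is a placeholder):

```bash
# Exactly one endpoint address is expected: the leader pod serving HTTP on port 30000.
kubectl get service deepseek-r1-svc
kubectl get endpoints deepseek-r1-svc -o wide
```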
