
Commit d9e8da3

Author: Copybara

Copybara import of gpu-recipes:

- df21bfa2ad601ffa7fd05920fb471d47f388a459 Add initial helm chart for nccl tests
- f25b9c7ae253d20ee539828232d6938e4aed9897 vLLM A3Ultra single node serving of DeepSeek R1
- bdb5462570d2e7dfb7ef0789a867ba0e7fe586d9 Multi-host inference recipe for DeepSeek R1 671B with vLL...

GitOrigin-RevId: bdb5462570d2e7dfb7ef0789a867ba0e7fe586d9

1 parent: ffebc26

File tree: 33 files changed (+4338 −4 lines)

README.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -41,13 +41,15 @@ Welcome to the reproducible benchmark recipes repository for GPUs! This reposito
 | Models | GPU Machine Type | Framework | Workload Type | Orchestrator | Link to the recipe |
 | ---------------- | ---------------- | --------- | ------------------- | ------------ | ------------------ |
 | **DeepSeek R1 671B** | [A3 Mega (NVIDIA H100)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-mega-vms) | SGLang | Inference | GKE | [Link](./inference/a3mega/deepseek-r1-671b/sglang-serving-gke/README.md)
+| **DeepSeek R1 671B** | [A3 Mega (NVIDIA H100)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-mega-vms) | vLLM | Inference | GKE | [Link](./inference/a3mega/deepseek-r1-671b/vllm-serving-gke/README.md)
 
 ### Inference benchmarks A3 Ultra
 
 | Models | GPU Machine Type | Framework | Workload Type | Orchestrator | Link to the recipe |
 | ---------------- | ---------------- | --------- | ------------------- | ------------ | ------------------ |
 | **Llama-3.1-405B** | [A3 Ultra (NVIDIA H200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-ultra-vms) | TensorRT-LLM | Inference | GKE | [Link](./inference/a3ultra/llama-3.1-405b/trtllm-inference-gke/single-node/README.md)
 | **DeepSeek R1 671B** | [A3 Ultra (NVIDIA H200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-ultra-vms) | SGLang | Inference | GKE | [Link](./inference/a3ultra/deepseek-r1-671b/sglang-serving-gke/README.md)
+| **DeepSeek R1 671B** | [A3 Ultra (NVIDIA H200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-ultra-vms) | vLLM | Inference | GKE | [Link](./inference/a3ultra/deepseek-r1-671b/vllm-serving-gke/README.md)
 
 
 ## Repository structure
```

inference/a3mega/deepseek-r1-671b/sglang-serving-gke/README.md

Lines changed: 4 additions & 4 deletions
````diff
@@ -187,12 +187,12 @@ The recipe uses the helm chart to run the above steps.
 
 4. To view the logs for the deployment, you can run
    ```bash
-   kubectl logs -f job/$USER-serving-deepseek-r1-model
+   kubectl logs -f service/$USER-serving-deepseek-r1-model-svc
    ```
 
 5. Verify if the deployment has started by running
    ```bash
-   kubectl get deployment/$USER-serving-deepseek-r1-model
+   kubectl get service/$USER-serving-deepseek-r1-model-svc
    ```
 
 6. Once the deployment has started, you will see logs similar to this:
@@ -275,9 +275,9 @@ The recipe uses the helm chart to run the above steps.
    ./stream_chat.sh "Which is bigger 9.9 or 9.11 ?"
    ```
 
-10. To run benchmarks for inference, you can use the default benchamrking tool from SGLang like this
+10. To run benchmarks for inference, you can use the default benchmarking tool from SGLang like this
    ```bash
-   kubectl exec -it $USER-serving-deepseek-r1-model-0 -- /bin/bash -c "python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 1100 --random-input 1000 --random-output 1000 --host 0.0.0.0 --port 30000 --output-file /gcs/benchmark_logs/sglang/ds_1000_1000_1100_output.jsonl"
+   kubectl exec -it service/$USER-serving-deepseek-r1-model-svc -- /bin/bash -c "python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 1100 --random-input 1000 --random-output 1000 --host 0.0.0.0 --port 30000 --output-file /gcs/benchmark_logs/sglang/ds_1000_1000_1100_output.jsonl"
    ```
 
 Once the benchmark is done, you can find the results in the GCS Bucket. You should see logs similar to this:
````
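These changes address the workload through its Service rather than through a Job or a pod name; `kubectl logs` and `kubectl exec` both accept `service/<name>` and resolve it to one of the pods the Service selects. A quick sanity-check sketch (standard kubectl commands, not taken from the recipe):

```bash
# Confirm the Service exists and see which pod IPs currently back it.
kubectl get service/$USER-serving-deepseek-r1-model-svc
kubectl get endpoints $USER-serving-deepseek-r1-model-svc

# logs/exec via service/<name> pick a single backing pod; to target a
# specific pod instead, list the pods and use its name directly.
kubectl get pods -o wide
```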

inference/a3mega/deepseek-r1-671b/vllm-serving-gke/README.md

Lines changed: 370 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 85 additions & 0 deletions
This new file is a Bash client that streams a chat completion from the model server at `localhost:8000` and splits DeepSeek R1's `<think>` reasoning from the final answer:

```bash
#!/bin/bash

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


[ $# -eq 0 ] && {
  echo "Error: No prompt provided."
  echo "Usage: $0 \"Your prompt here\""
  exit 1
}

start_time=$(date +%s.%N)
temp_file="/tmp/temp_response.txt"
: > "$temp_file"  # start empty so output from a previous run is not appended

# Format the JSON payload to send to the model, with streaming enabled.
json_payload=$(jq -n \
  --arg prompt "$1" \
  '{
    model: "deepseek-ai/DeepSeek-R1",
    messages: [
      {role: "system", content: "You are a helpful AI assistant"},
      {role: "user", content: $prompt}
    ],
    temperature: 0.6,
    top_p: 0.95,
    max_tokens: 2048,
    stream: true
  }')

echo "Streaming response:"
echo "----------------"

# Send the request to the model and stream the response. Each SSE line is
# prefixed with "data: "; the payload is JSON until the final "[DONE]".
curl -sN "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d "$json_payload" | while IFS= read -r line; do
    [[ -z $line ]] && continue

    line=${line#data: }
    [[ $line == "[DONE]" ]] && continue

    content=$(jq -r '.choices[0].delta.content // empty' <<< "$line")
    [[ -n $content ]] && {
      echo -n "$content"
      echo -n "$content" >> "$temp_file"
    }
done

echo -e "\n\n----------------"

[[ ! -s $temp_file ]] && {
  echo "Error: No response received from the API or an error occurred during streaming." >&2
  rm -f "$temp_file"
  exit 1
}

# Parse the response and extract the reasoning and the final answer.
full_content=$(<"$temp_file")

[[ $full_content =~ \<think\>([[:print:][:space:]]*)\</think\> ]] && \
  reasoning="${BASH_REMATCH[1]}" || reasoning=""

final_answer=$(sed 's/.*<\/think>//; s/^[[:space:]]*//; s/[[:space:]]*$//' <<< "$full_content")

execution_time=$(bc <<< "$(date +%s.%N) - $start_time")

echo -e "\nParsed Results:"
echo "----------------"
echo -e "Reasoning:\n$reasoning"
echo -e "\nFinal Answer:\n$final_answer"
echo -e "\nExecution time: $execution_time seconds"

rm "$temp_file"
```
Lines changed: 60 additions & 0 deletions
The other new file is a Helm values file for the deployment:

```yaml
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

targetPlatform: "gke"

clusterName:
queue:

huggingface:
  secretName: hf-secret
  secretData:
    token: "hf_api_token"

model:
  name: deepseek-ai/DeepSeek-R1
  tp_size: 8
  pp_size: 2

job:
  image:
    repository:
    tag:
  gpus: 16

volumes:
  ssdMountPath: "/ssd"
  gcsMounts:
  - bucketName:
    mountPath: "/gcs"

gpuPlatformSettings:
  useHostPlugin: false
  ncclPluginImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1"
  rxdmImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14"
  ncclBuildType: 223

network:
  ncclSettings:
  - name: NCCL_DEBUG
    value: "VERSION"
  subnetworks[]:

vllm:
  replicaCount: 1

service:
  type: ClusterIP
  ports:
    http: 8000
```