Skip to content

Commit 471fac0

Browse files
authored
Add support to echo the sim's pod name and namespace (#128)
* add pod name and ns headers Signed-off-by: npolshakova <[email protected]> * add pod name and ns env Signed-off-by: npolshakova <[email protected]> * Signed-off-by: npolshakova <[email protected]> feedback Signed-off-by: npolshakova <[email protected]> * reuse env var Signed-off-by: npolshakova <[email protected]> * feedback Signed-off-by: npolshakova <[email protected]> * add unset env tests Signed-off-by: npolshakova <[email protected]> --------- Signed-off-by: npolshakova <[email protected]>
1 parent 7bcee36 commit 471fac0

File tree

9 files changed

+280
-14
lines changed

9 files changed

+280
-14
lines changed

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ SRC = $(shell find . -type f -name '*.go')
3636
help: ## Print help
3737
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
3838

39-
LDFLAGS ?= -extldflags '-L$(shell pwd)/lib'
39+
GO_LDFLAGS := -extldflags '-L$(shell pwd)/lib $(LDFLAGS)'
4040
CGO_ENABLED=1
4141
TOKENIZER_LIB = lib/libtokenizers.a
4242
# Extract TOKENIZER_VERSION from Dockerfile
@@ -67,7 +67,7 @@ format: ## Format Go source files
6767
.PHONY: test
6868
test: check-ginkgo download-tokenizer download-zmq ## Run tests
6969
@printf "\033[33;1m==== Running tests ====\033[0m\n"
70-
CGO_ENABLED=1 ginkgo -ldflags="$(LDFLAGS)" -v -r
70+
CGO_ENABLED=1 ginkgo -ldflags="$(GO_LDFLAGS)" -v -r
7171

7272
.PHONY: post-deploy-test
7373
post-deploy-test: ## Run post deployment tests
@@ -84,7 +84,7 @@ lint: check-golangci-lint ## Run lint
8484
.PHONY: build
8585
build: check-go download-tokenizer download-zmq
8686
@printf "\033[33;1m==== Building ====\033[0m\n"
87-
go build -ldflags="$(LDFLAGS)" -o bin/$(PROJECT_NAME) cmd/$(PROJECT_NAME)/main.go
87+
go build -ldflags="$(GO_LDFLAGS)" -o bin/$(PROJECT_NAME) cmd/$(PROJECT_NAME)/main.go
8888

8989
##@ Container Build/Push
9090

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ make image-build
156156
Please note that the default image tag is `ghcr.io/llm-d/llm-d-inference-sim:dev`. <br>
157157
The following environment variables can be used to change the image tag: `REGISTRY`, `SIM_TAG`, `IMAGE_TAG_BASE` or `IMG`.
158158

159+
Note: On macOS, use `make image-build TARGETOS=linux` to pull the correct base image.
160+
159161
### Running
160162
To run the vLLM Simulator image under Docker, run:
161163
```bash
@@ -186,6 +188,13 @@ To run the vLLM simulator in a Kubernetes cluster, run:
186188
kubectl apply -f manifests/deployment.yaml
187189
```
188190

191+
When testing locally with kind, build the Docker image with `make image-build`, then load it into the cluster:
192+
```shell
193+
kind load --name kind docker-image ghcr.io/llm-d/llm-d-inference-sim:dev
194+
```
195+
196+
Update the `deployment.yaml` file to use the dev tag.
197+
189198
To verify the deployment is available, run:
190199
```bash
191200
kubectl get deployment vllm-llama3-8b-instruct

manifests/deployment.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,17 @@ spec:
2525
image: ghcr.io/llm-d/llm-d-inference-sim:latest
2626
imagePullPolicy: IfNotPresent
2727
name: vllm-sim
28+
env:
29+
- name: POD_NAME
30+
valueFrom:
31+
fieldRef:
32+
apiVersion: v1
33+
fieldPath: metadata.name
34+
- name: POD_NAMESPACE
35+
valueFrom:
36+
fieldRef:
37+
apiVersion: v1
38+
fieldPath: metadata.namespace
2839
ports:
2940
- containerPort: 8000
3041
name: http

pkg/llm-d-inference-sim/lora_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ var _ = Describe("LoRAs", func() {
3737
client, err := startServerWithArgs(ctx, "",
3838
[]string{"cmd", "--model", model, "--mode", common.ModeEcho,
3939
"--lora-modules", "{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}",
40-
"{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"})
40+
"{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"}, nil)
4141
Expect(err).NotTo(HaveOccurred())
4242

4343
openaiclient := openai.NewClient(

pkg/llm-d-inference-sim/seed_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ var _ = Describe("Simulator with seed", func() {
3333
func() {
3434
ctx := context.TODO()
3535
client, err := startServerWithArgs(ctx, common.ModeRandom,
36-
[]string{"cmd", "--model", model, "--mode", common.ModeRandom, "--seed", "100"})
36+
[]string{"cmd", "--model", model, "--mode", common.ModeRandom, "--seed", "100"}, nil)
3737
Expect(err).NotTo(HaveOccurred())
3838

3939
openaiclient := openai.NewClient(

pkg/llm-d-inference-sim/simulator.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"encoding/json"
2323
"fmt"
2424
"net"
25+
"os"
2526
"strings"
2627
"sync"
2728
"sync/atomic"
@@ -46,6 +47,11 @@ const (
4647
textCompletionObject = "text_completion"
4748
chatCompletionObject = "chat.completion"
4849
chatCompletionChunkObject = "chat.completion.chunk"
50+
51+
podHeader = "x-inference-pod"
52+
namespaceHeader = "x-inference-namespace"
53+
podNameEnv = "POD_NAME"
54+
podNsEnv = "POD_NAMESPACE"
4955
)
5056

5157
// VllmSimulator simulates vLLM server supporting OpenAI API
@@ -79,6 +85,10 @@ type VllmSimulator struct {
7985
toolsValidator *openaiserverapi.Validator
8086
// kv cache functionality
8187
kvcacheHelper *kvcache.KVCacheHelper
88+
// namespace where simulator is running
89+
namespace string
90+
// pod name of simulator
91+
pod string
8292
}
8393

8494
// New creates a new VllmSimulator instance with the given logger
@@ -93,6 +103,8 @@ func New(logger logr.Logger) (*VllmSimulator, error) {
93103
reqChan: make(chan *openaiserverapi.CompletionReqCtx, 1000),
94104
toolsValidator: toolsValidtor,
95105
kvcacheHelper: nil, // kvcache helper will be created only if required after reading configuration
106+
namespace: os.Getenv(podNsEnv),
107+
pod: os.Getenv(podNameEnv),
96108
}, nil
97109
}
98110

@@ -599,9 +611,15 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
599611
totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + s.getTotalInterTokenLatency(numOfTokens)
600612
time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
601613

602-
// TODO - maybe add pod id to response header for testing
603614
ctx.Response.Header.SetContentType("application/json")
604615
ctx.Response.Header.SetStatusCode(fasthttp.StatusOK)
616+
// Add pod and namespace information to response headers for testing/debugging
617+
if s.pod != "" {
618+
ctx.Response.Header.Add(podHeader, s.pod)
619+
}
620+
if s.namespace != "" {
621+
ctx.Response.Header.Add(namespaceHeader, s.namespace)
622+
}
605623
ctx.Response.SetBody(data)
606624

607625
s.responseSentCallback(modelName)

0 commit comments

Comments
 (0)