Commit fbaa66a

feat: add support for X-Request-Id header in responses and logs
Signed-off-by: rudeigerc <[email protected]>
1 parent 063122c commit fbaa66a

File tree

8 files changed: +165 additions, -23 deletions

go.mod

Lines changed: 4 additions & 7 deletions
@@ -21,12 +21,7 @@ require (
 	golang.org/x/sync v0.12.0
 	gopkg.in/yaml.v3 v3.0.1
 	k8s.io/klog/v2 v2.130.1
-)
-
-require (
-	github.com/dgraph-io/ristretto/v2 v2.3.0 // indirect
-	github.com/dustin/go-humanize v1.0.1 // indirect
-	go.uber.org/multierr v1.11.0 // indirect
+	sigs.k8s.io/controller-runtime v0.21.0
 )
 
 require (
@@ -35,7 +30,9 @@ require (
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/daulet/tokenizers v1.22.1 // indirect
 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
+	github.com/dgraph-io/ristretto/v2 v2.3.0 // indirect
 	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
+	github.com/dustin/go-humanize v1.0.1 // indirect
 	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
 	github.com/fxamacker/cbor/v2 v2.7.0 // indirect
 	github.com/go-openapi/jsonpointer v0.21.0 // indirect
@@ -68,6 +65,7 @@ require (
 	github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
 	github.com/x448/float16 v0.8.4 // indirect
 	go.uber.org/automaxprocs v1.6.0 // indirect
+	go.uber.org/multierr v1.11.0 // indirect
 	golang.org/x/net v0.38.0 // indirect
 	golang.org/x/oauth2 v0.27.0 // indirect
 	golang.org/x/sys v0.35.0 // indirect
@@ -83,7 +81,6 @@ require (
 	k8s.io/client-go v0.33.0 // indirect
 	k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
 	k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect
-	sigs.k8s.io/controller-runtime v0.21.0
 	sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
 	sigs.k8s.io/randfill v1.0.0 // indirect
 	sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect

pkg/common/config.go

Lines changed: 4 additions & 0 deletions
@@ -223,6 +223,9 @@ type Configuration struct {
 
 	// EnableSleepMode enables sleep mode
 	EnableSleepMode bool `yaml:"enable-sleep-mode" json:"enable-sleep-mode"`
+
+	// EnableRequestIDHeaders enables including X-Request-Id header in responses
+	EnableRequestIDHeaders bool `yaml:"enable-request-id-headers" json:"enable-request-id-headers"`
 }
 
 type Metrics struct {
@@ -749,6 +752,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.BoolVar(&config.DatasetInMemory, "dataset-in-memory", config.DatasetInMemory, "Load the entire dataset into memory for faster access")
 
 	f.BoolVar(&config.EnableSleepMode, "enable-sleep-mode", config.EnableSleepMode, "Enable sleep mode")
+	f.BoolVar(&config.EnableRequestIDHeaders, "enable-request-id-headers", config.EnableRequestIDHeaders, "Enable including X-Request-Id header in responses")
 
 	f.IntVar(&config.FailureInjectionRate, "failure-injection-rate", config.FailureInjectionRate, "Probability (0-100) of injecting failures")
 	failureTypes := getParamValueFromArgs("failure-types")
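
The flag is registered against the already-loaded configuration value, so the header support stays opt-in. As a minimal, self-contained sketch of how this option is typically wired with the standard flag package; the trimmed-down Configuration type, variable names, and the false default are assumptions for illustration, only the flag name, help text, and field name come from the diff above.

package main

import (
	"flag"
	"fmt"
)

// Configuration keeps only the field added in this commit; the real struct
// lives in pkg/common and has many more fields.
type Configuration struct {
	EnableRequestIDHeaders bool `yaml:"enable-request-id-headers" json:"enable-request-id-headers"`
}

func main() {
	cfg := Configuration{}
	// Same flag name and help text as in ParseCommandParamsAndLoadConfig;
	// the false default here is an assumption for this sketch.
	flag.BoolVar(&cfg.EnableRequestIDHeaders, "enable-request-id-headers", false,
		"Enable including X-Request-Id header in responses")
	flag.Parse()
	fmt.Println("request ID headers enabled:", cfg.EnableRequestIDHeaders)
}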

pkg/llm-d-inference-sim/server.go

Lines changed: 17 additions & 1 deletion
@@ -109,9 +109,20 @@ func (s *VllmSimulator) startServer(ctx context.Context, listener net.Listener)
 	}
 }
 
+// getRequestID retrieves the request ID from the X-Request-Id header or generates a new one if not present
+func (s *VllmSimulator) getRequestID(ctx *fasthttp.RequestCtx) string {
+	if s.config.EnableRequestIDHeaders {
+		requestID := string(ctx.Request.Header.Peek(requestIDHeader))
+		if requestID != "" {
+			return requestID
+		}
+	}
+	return s.random.GenerateUUIDString()
+}
+
 // readRequest reads and parses data from the body of the given request according the type defined by isChatCompletion
 func (s *VllmSimulator) readRequest(ctx *fasthttp.RequestCtx, isChatCompletion bool) (openaiserverapi.CompletionRequest, error) {
-	requestID := s.random.GenerateUUIDString()
+	requestID := s.getRequestID(ctx)
 
 	if isChatCompletion {
 		var req openaiserverapi.ChatCompletionRequest
@@ -266,6 +277,11 @@ func (s *VllmSimulator) sendCompletionResponse(ctx *fasthttp.RequestCtx, resp op
 	if s.namespace != "" {
 		ctx.Response.Header.Add(namespaceHeader, s.namespace)
 	}
+	if s.config.EnableRequestIDHeaders {
+		if requestID := resp.GetRequestID(); requestID != "" {
+			ctx.Response.Header.Add(requestIDHeader, requestID)
+		}
+	}
 	ctx.Response.SetBody(data)
 }
 
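
To see the echo-or-generate behaviour in isolation, here is a standalone sketch of the same pattern as a bare fasthttp handler; the handler, the port, and the newID helper are assumptions made for this example, only the header name and the Peek/Add calls mirror the change above.

package main

import (
	"crypto/rand"
	"encoding/hex"
	"log"

	"github.com/valyala/fasthttp"
)

const requestIDHeader = "X-Request-Id"

// newID stands in for the simulator's UUID generator.
func newID() string {
	b := make([]byte, 16)
	_, _ = rand.Read(b)
	return hex.EncodeToString(b)
}

func handler(ctx *fasthttp.RequestCtx) {
	// Echo the incoming X-Request-Id if present, otherwise generate one.
	requestID := string(ctx.Request.Header.Peek(requestIDHeader))
	if requestID == "" {
		requestID = newID()
	}
	ctx.Response.Header.Add(requestIDHeader, requestID)
	ctx.SetBodyString(`{"id":"chatcmpl-` + requestID + `"}`)
}

func main() {
	log.Fatal(fasthttp.ListenAndServe(":8080", handler))
}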

pkg/llm-d-inference-sim/server_test.go

Lines changed: 109 additions & 2 deletions
@@ -25,11 +25,13 @@ import (
 	"strings"
 	"time"
 
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/valyala/fasthttp"
+
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
 	kvcache "github.com/llm-d/llm-d-inference-sim/pkg/kv-cache"
 	vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
 )
 
 const tmpDir = "./tests-tmp/"
@@ -212,6 +214,111 @@ var _ = Describe("Server", func() {
 
 	})
 
+	Context("request ID headers", func() {
+		testRequestIDHeader := func(enableRequestID bool, endpoint, reqBody, inputRequestID string, expectRequestID *string, validateBody func([]byte)) {
+			ctx := context.TODO()
+			args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho}
+			if enableRequestID {
+				args = append(args, "--enable-request-id-headers")
+			}
+			client, err := startServerWithArgs(ctx, args)
+			Expect(err).NotTo(HaveOccurred())
+
+			req, err := http.NewRequest("POST", "http://localhost"+endpoint, strings.NewReader(reqBody))
+			Expect(err).NotTo(HaveOccurred())
+			req.Header.Set(fasthttp.HeaderContentType, "application/json")
+			if inputRequestID != "" {
+				req.Header.Set(requestIDHeader, inputRequestID)
+			}
+
+			resp, err := client.Do(req)
+			Expect(err).NotTo(HaveOccurred())
+			defer func() {
+				err := resp.Body.Close()
+				Expect(err).NotTo(HaveOccurred())
+			}()
+
+			Expect(resp.StatusCode).To(Equal(http.StatusOK))
+
+			if expectRequestID != nil {
+				actualRequestID := resp.Header.Get(requestIDHeader)
+				if *expectRequestID != "" {
+					// When a request ID is provided, it should be echoed back
+					Expect(actualRequestID).To(Equal(*expectRequestID))
+				} else {
+					// When no request ID is provided, a UUID should be generated
+					Expect(actualRequestID).NotTo(BeEmpty())
+					Expect(len(actualRequestID)).To(BeNumerically(">", 30))
+				}
+			} else {
+				// When request ID headers are disabled, the header should be empty
+				Expect(resp.Header.Get(requestIDHeader)).To(BeEmpty())
+			}
+
+			if validateBody != nil {
+				body, err := io.ReadAll(resp.Body)
+				Expect(err).NotTo(HaveOccurred())
+				validateBody(body)
+			}
+		}
+
+		DescribeTable("request ID behavior",
+			testRequestIDHeader,
+			Entry("includes X-Request-Id when enabled",
+				true,
+				"/v1/chat/completions",
+				`{"messages": [{"role": "user", "content": "Hello"}], "model": "`+testModel+`", "max_tokens": 5}`,
+				"test-request-id-123",
+				ptr("test-request-id-123"),
+				nil,
+			),
+			Entry("excludes X-Request-Id when disabled",
+				false,
+				"/v1/chat/completions",
+				`{"messages": [{"role": "user", "content": "Hello"}], "model": "`+testModel+`", "max_tokens": 5}`,
+				"test-request-id-456",
+				nil,
+				nil,
+			),
+			Entry("includes X-Request-Id in streaming response",
+				true,
+				"/v1/chat/completions",
+				`{"messages": [{"role": "user", "content": "Hello"}], "model": "`+testModel+`", "max_tokens": 5, "stream": true}`,
+				"test-streaming-789",
+				ptr("test-streaming-789"),
+				nil,
+			),
+			Entry("works with text completions endpoint",
+				true,
+				"/v1/completions",
+				`{"prompt": "Hello world", "model": "`+testModel+`", "max_tokens": 5}`,
+				"text-request-111",
+				ptr("text-request-111"),
+				nil,
+			),
+			Entry("generates UUID when no request ID provided",
+				true,
+				"/v1/chat/completions",
+				`{"messages": [{"role": "user", "content": "Hello"}], "model": "`+testModel+`", "max_tokens": 5}`,
+				"",
+				ptr(""),
+				nil,
+			),
+			Entry("uses request ID in response body ID field",
+				true,
+				"/v1/chat/completions",
+				`{"messages": [{"role": "user", "content": "Hello"}], "model": "`+testModel+`", "max_tokens": 5}`,
+				"body-test-999",
+				ptr("body-test-999"),
+				func(body []byte) {
+					var resp map[string]any
+					Expect(json.Unmarshal(body, &resp)).To(Succeed())
+					Expect(resp["id"]).To(Equal("chatcmpl-body-test-999"))
+				},
+			),
+		)
+	})
+
 	Context("sleep mode", Ordered, func() {
 		AfterAll(func() {
 			err := os.RemoveAll(tmpDir)
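
Outside the test harness, the same behaviour can be exercised with a plain HTTP client. A minimal sketch follows; the base URL, port, and model name are assumptions, and the simulator is presumed to be running with --enable-request-id-headers.

package main

import (
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// baseURL, port and model are assumptions for illustration only.
	baseURL := "http://localhost:8000"
	body := `{"messages": [{"role": "user", "content": "Hello"}], "model": "my-model", "max_tokens": 5}`

	req, err := http.NewRequest(http.MethodPost, baseURL+"/v1/chat/completions", strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("X-Request-Id", "trace-me-123")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// With the feature enabled the simulator echoes the ID back.
	fmt.Println("X-Request-Id:", resp.Header.Get("X-Request-Id"))
}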

pkg/llm-d-inference-sim/simulator.go

Lines changed: 6 additions & 4 deletions
@@ -51,6 +51,7 @@ const (
 	podHeader = "x-inference-pod"
 	portHeader = "x-inference-port"
 	namespaceHeader = "x-inference-namespace"
+	requestIDHeader = "X-Request-Id"
 	podNameEnv = "POD_NAME"
 	podNsEnv = "POD_NAMESPACE"
 )
@@ -581,9 +582,9 @@ func (s *VllmSimulator) responseSentCallback(model string, isChatCompletion bool
 // modelName - display name returned to the client and used in metrics. It is either the first alias
 // from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request).
 func (s *VllmSimulator) createCompletionResponse(logprobs *int, isChatCompletion bool, respTokens []string, toolCalls []openaiserverapi.ToolCall,
-	finishReason *string, usageData *openaiserverapi.Usage, modelName string, doRemoteDecode bool) openaiserverapi.CompletionResponse {
-	baseResp := openaiserverapi.CreateBaseCompletionResponse(chatComplIDPrefix+s.random.GenerateUUIDString(),
-		time.Now().Unix(), modelName, usageData)
+	finishReason *string, usageData *openaiserverapi.Usage, modelName string, doRemoteDecode bool, requestID string) openaiserverapi.CompletionResponse {
+	baseResp := openaiserverapi.CreateBaseCompletionResponse(chatComplIDPrefix+requestID,
+		time.Now().Unix(), modelName, usageData, requestID)
 
 	if doRemoteDecode {
 		baseResp.KVParams = &openaiserverapi.KVTransferParams{}
@@ -663,9 +664,10 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
 	if toolCalls == nil {
 		logprobs = reqCtx.CompletionReq.GetLogprobs()
 	}
+	requestID := reqCtx.CompletionReq.GetRequestID()
 
 	resp := s.createCompletionResponse(logprobs, reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName,
-		reqCtx.CompletionReq.IsDoRemoteDecode())
+		reqCtx.CompletionReq.IsDoRemoteDecode(), requestID)
 
 	// calculate how long to wait before returning the response, time is based on number of tokens
 	nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
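
The practical effect is that the response ID is now derived from the request ID instead of a freshly generated UUID. A tiny sketch of the resulting ID, assuming chatComplIDPrefix is "chatcmpl-"; that value is inferred from the test expectation chatcmpl-body-test-999 rather than shown in this diff.

package main

import "fmt"

// Value inferred from the test expectation "chatcmpl-body-test-999"; the real
// chatComplIDPrefix constant is defined elsewhere in the simulator package.
const chatComplIDPrefix = "chatcmpl-"

func main() {
	requestID := "body-test-999"
	// After this change the response ID tracks the request ID.
	fmt.Println(chatComplIDPrefix + requestID) // chatcmpl-body-test-999
}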

pkg/llm-d-inference-sim/streaming.go

Lines changed: 9 additions & 6 deletions
@@ -60,6 +60,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 	if s.namespace != "" {
 		context.ctx.Response.Header.Add(namespaceHeader, s.namespace)
 	}
+	if s.config.EnableRequestIDHeaders {
+		context.ctx.Response.Header.Add(requestIDHeader, context.requestID)
+	}
 
 	context.ctx.SetBodyStreamWriter(func(w *bufio.Writer) {
 		context.creationTime = time.Now().Unix()
@@ -176,8 +179,8 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ
 // createUsageChunk creates and returns a CompletionRespChunk with usage data, a single chunk of streamed completion API response,
 // supports both modes (text and chat)
 func (s *VllmSimulator) createUsageChunk(context *streamingContext, usageData *openaiserverapi.Usage) openaiserverapi.CompletionRespChunk {
-	baseChunk := openaiserverapi.CreateBaseCompletionResponse(chatComplIDPrefix+s.random.GenerateUUIDString(),
-		context.creationTime, context.model, usageData)
+	baseChunk := openaiserverapi.CreateBaseCompletionResponse(chatComplIDPrefix+context.requestID,
+		context.creationTime, context.model, usageData, context.requestID)
 
 	if context.isChatCompletion {
 		baseChunk.Object = chatCompletionChunkObject
@@ -191,8 +194,8 @@ func (s *VllmSimulator) createUsageChunk(context *streamingContext, usageData *o
 // createTextCompletionChunk creates and returns a CompletionRespChunk, a single chunk of streamed completion API response,
 // for text completion.
 func (s *VllmSimulator) createTextCompletionChunk(context *streamingContext, token string, finishReason *string) openaiserverapi.CompletionRespChunk {
-	baseChunk := openaiserverapi.CreateBaseCompletionResponse(chatComplIDPrefix+s.random.GenerateUUIDString(),
-		context.creationTime, context.model, nil)
+	baseChunk := openaiserverapi.CreateBaseCompletionResponse(chatComplIDPrefix+context.requestID,
+		context.creationTime, context.model, nil, context.requestID)
 	baseChunk.Object = textCompletionObject
 
 	choice := openaiserverapi.CreateTextRespChoice(openaiserverapi.CreateBaseResponseChoice(0, finishReason), token)
@@ -214,8 +217,8 @@ func (s *VllmSimulator) createTextCompletionChunk(context *streamingContext, tok
 // API response, for chat completion. It sets either role, or token, or tool call info in the message.
 func (s *VllmSimulator) createChatCompletionChunk(context *streamingContext, token string, tool *openaiserverapi.ToolCall,
 	role string, finishReason *string) openaiserverapi.CompletionRespChunk {
-	baseChunk := openaiserverapi.CreateBaseCompletionResponse(chatComplIDPrefix+s.random.GenerateUUIDString(),
-		context.creationTime, context.model, nil)
+	baseChunk := openaiserverapi.CreateBaseCompletionResponse(chatComplIDPrefix+context.requestID,
+		context.creationTime, context.model, nil, context.requestID)
 	baseChunk.Object = chatCompletionChunkObject
 	chunk := openaiserverapi.CreateChatCompletionRespChunk(baseChunk,
 		[]openaiserverapi.ChatRespChunkChoice{
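
In the streaming path the header is written before the body stream starts, and every chunk now carries the same request-derived ID. A sketch of reading both from the client side; the URL, port, model, and the OpenAI-style "data: " framing of the stream are assumptions for this example.

package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// URL, port and model are assumptions; "stream": true exercises the path above.
	body := `{"messages": [{"role": "user", "content": "Hello"}], "model": "my-model", "stream": true}`
	req, err := http.NewRequest(http.MethodPost, "http://localhost:8000/v1/chat/completions", strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("X-Request-Id", "stream-42")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The header arrives with the response headers, before any chunks.
	fmt.Println("X-Request-Id:", resp.Header.Get("X-Request-Id"))

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.HasPrefix(line, "data: ") {
			fmt.Println(line) // each chunk should carry the same request-derived id
			break
		}
	}
}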

pkg/llm-d-inference-sim/test_utils.go

Lines changed: 4 additions & 0 deletions
@@ -541,3 +541,7 @@ func checkSimSleeping(client *http.Client, expectedToSleep bool) {
 	expect := fmt.Sprintf("{\"is_sleeping\":%t}", expectedToSleep)
 	gomega.Expect(string(body)).To(gomega.Equal(expect))
 }
+
+func ptr[T any](v T) *T {
+	return &v
+}
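
ptr exists so the table entries in server_test.go can express three different expectations through a single *string parameter. A quick self-contained sketch of that mapping; the interpretation of nil, empty, and non-empty values follows the test logic above.

package main

import "fmt"

func ptr[T any](v T) *T { return &v }

func main() {
	want := ptr("test-request-id-123") // non-nil, non-empty: the header must echo this exact value
	generated := ptr("")               // non-nil, empty: a generated UUID is expected instead
	var disabled *string               // nil: the header must be absent (feature disabled)
	fmt.Println(*want, *generated == "", disabled == nil)
}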

pkg/openai-server-api/response.go

Lines changed: 12 additions & 3 deletions
@@ -26,7 +26,9 @@ import (
 )
 
 // CompletionResponse interface representing both completion response types (text and chat)
-type CompletionResponse interface{}
+type CompletionResponse interface {
+	GetRequestID() string
+}
 
 // baseCompletionResponse contains base completion response related information
 type baseCompletionResponse struct {
@@ -42,6 +44,8 @@ type baseCompletionResponse struct {
 	Object string `json:"object"`
 	// KVParams kv transfer related fields
 	KVParams *KVTransferParams `json:"kv_transfer_params"`
+	// RequestID is the unique request ID for tracking
+	RequestID string `json:"-"`
 }
 
 // Usage contains token Usage statistics
@@ -303,8 +307,13 @@ func CreateTextRespChoice(base baseResponseChoice, text string) TextRespChoice {
 	return TextRespChoice{baseResponseChoice: base, Text: text, Logprobs: nil}
 }
 
-func CreateBaseCompletionResponse(id string, created int64, model string, usage *Usage) baseCompletionResponse {
-	return baseCompletionResponse{ID: id, Created: created, Model: model, Usage: usage}
+func CreateBaseCompletionResponse(id string, created int64, model string, usage *Usage, requestID string) baseCompletionResponse {
+	return baseCompletionResponse{ID: id, Created: created, Model: model, Usage: usage, RequestID: requestID}
+}
+
+// GetRequestID returns the request ID from the response
+func (b baseCompletionResponse) GetRequestID() string {
+	return b.RequestID
 }
 
 func CreateChatCompletionResponse(base baseCompletionResponse, choices []ChatRespChoice) *ChatCompletionResponse {
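
Because GetRequestID has a value receiver on the embedded base struct, every concrete response type picks up the method and satisfies the new interface automatically, which is what sendCompletionResponse relies on. A self-contained sketch of that embedding pattern; the shortened type names below are illustrative and not the package's real exported surface.

package main

import "fmt"

// CompletionResponse mirrors the interface added above.
type CompletionResponse interface {
	GetRequestID() string
}

// base mirrors baseCompletionResponse; only the relevant field is kept.
type base struct {
	RequestID string `json:"-"`
}

func (b base) GetRequestID() string { return b.RequestID }

// chatResponse stands in for a concrete response type that embeds the base struct.
type chatResponse struct {
	base
}

func main() {
	var resp CompletionResponse = chatResponse{base{RequestID: "abc-123"}}
	fmt.Println(resp.GetRequestID()) // abc-123
}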
