
Commit 658e3e5

Add synchronization of freeing worker after stream request processing (#244)

* add synchronization of freeing worker after stream request processing
* additional changes which fix e2e request latency and inference time calculations for requests in streaming mode

Signed-off-by: Maya Barnea <[email protected]>
1 parent 3967e23 commit 658e3e5
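
For streaming requests the HTTP handler returns before the response chunks are written, so measuring end-to-end latency with a defer inside the handler (the previous approach) stops the clock too early. The commit instead stamps the request with StartProcessing on arrival and reports the latency only after the streaming side signals completion. Below is a minimal, hypothetical sketch of that timing pattern; none of these names are the simulator's real types.

package main

import (
	"fmt"
	"sync"
	"time"
)

// request carries its own arrival timestamp, analogous to CompletionReqCtx.StartProcessing.
type request struct {
	startProcessing time.Time
	done            sync.WaitGroup
}

func main() {
	req := &request{startProcessing: time.Now()}
	req.done.Add(1)

	// The "streaming" goroutine keeps sending chunks after the handler would
	// already have returned; it signals completion explicitly.
	go func() {
		defer req.done.Done()
		time.Sleep(30 * time.Millisecond) // stand-in for sending chunks
	}()

	// e2e latency is recorded only once the last chunk is out,
	// not when the handler exits.
	req.done.Wait()
	fmt.Println("e2e latency:", time.Since(req.startProcessing))
}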

File tree

5 files changed: +62 -19 lines changed

pkg/llm-d-inference-sim/simulator.go

Lines changed: 2 additions & 5 deletions

@@ -491,12 +491,8 @@ func (s *VllmSimulator) addRequestToQueue(reqCtx *openaiserverapi.CompletionReqC
 }
 
 // handleCompletions general completion requests handler, support both text and chat completion APIs
+// Important note: for requests in streaming mode, this function exits before all chunks are sent to the client
 func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatCompletion bool) {
-	startTime := time.Now()
-	defer func() {
-		common.WriteToChannel(s.metrics.e2eReqLatencyChan, time.Since(startTime).Seconds(), s.logger, "metrics.e2eReqLatencyChan")
-	}()
-
 	// Check if we should inject a failure
 	if shouldInjectFailure(s.config, s.random) {
 		failure := getRandomFailure(s.config, s.random)
@@ -526,6 +522,7 @@ func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatComple
 		HTTPReqCtx:       ctx,
 		IsChatCompletion: isChatCompletion,
 		Wg:               &wg,
+		StartProcessing:  time.Now(),
 	}
 	common.WriteToChannel(s.newRequests, reqCtx, s.logger, "newRequests")
 	wg.Wait()

pkg/llm-d-inference-sim/streaming.go

Lines changed: 5 additions & 2 deletions

@@ -21,6 +21,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"strconv"
+	"sync"
 	"time"
 
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
@@ -47,7 +48,7 @@ type streamingContext struct {
 // response content is wrapped according SSE format
 // First token is send after timeToFirstToken milliseconds, every other token is sent after interTokenLatency milliseconds
 func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, responseTokens []string, toolCalls []openaiserverapi.ToolCall,
-	finishReason string, usageData *openaiserverapi.Usage) {
+	finishReason string, usageData *openaiserverapi.Usage, wg *sync.WaitGroup) {
 	context.ctx.SetContentType("text/event-stream")
 	context.ctx.SetStatusCode(fasthttp.StatusOK)
 
@@ -78,8 +79,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 			s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason)
 		}
 	} else {
-		s.logger.Info("Going to send text", "number of tokens", len(responseTokens))
+		s.logger.V(4).Info("Going to send text", "number of tokens", len(responseTokens))
 		s.sendTokenChunks(context, w, responseTokens, nil, finishReason)
+		s.logger.V(4).Info("Finished sending text", "number of tokens", len(responseTokens))
 	}
 }
 
@@ -98,6 +100,7 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 			return
 		}
 		s.responseSentCallback(context.model, context.isChatCompletion, context.requestID)
+		wg.Done()
 	})
 }
 
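
sendStreamingResponse writes its chunks from inside fasthttp's SetBodyStreamWriter callback, which only runs while the response body is being written to the connection, i.e. after the HTTP handler has returned. The WaitGroup threaded in here is therefore the signal that the stream has actually finished. A small, hypothetical sketch of that pattern outside the simulator (handler and chunk contents are made up):

package main

import (
	"bufio"
	"fmt"
	"sync"

	"github.com/valyala/fasthttp"
)

func streamHandler(ctx *fasthttp.RequestCtx) {
	var wg sync.WaitGroup
	wg.Add(1)

	ctx.SetContentType("text/event-stream")
	ctx.SetBodyStreamWriter(func(w *bufio.Writer) {
		// This callback runs after streamHandler has returned.
		defer wg.Done()
		for i := 0; i < 3; i++ {
			fmt.Fprintf(w, "data: chunk %d\n\n", i)
			_ = w.Flush()
		}
	})

	// Anything that must happen only after the last chunk (freeing a worker,
	// recording latency) waits on wg from another goroutine; waiting here
	// would deadlock because the writer has not started yet.
	go func() {
		wg.Wait()
		fmt.Println("response fully sent")
	}()
}

func main() {
	_ = fasthttp.ListenAndServe(":8080", streamHandler)
}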

pkg/llm-d-inference-sim/test_utils.go

Lines changed: 34 additions & 4 deletions

@@ -151,11 +151,25 @@ func startServerAndSendRequest(modelName string, prompt string, isStreaming bool
 	client, err := startServerWithArgs(ctx, args)
 	gomega.Expect(err).NotTo(gomega.HaveOccurred())
 
-	openaiclient, params := getOpenAIClientAndChatParams(client, modelName, prompt, isStreaming)
+	openaitextclient, params := getOpenAIClientAndTextParams(client, modelName, prompt, isStreaming)
 
-	// send a single request in a serial way
-	_, err = openaiclient.Chat.Completions.New(ctx, params)
-	gomega.Expect(err).NotTo(gomega.HaveOccurred())
+	if isStreaming {
+		// send a single request in a serial way
+		stream := openaitextclient.Completions.NewStreaming(ctx, params)
+		chunksCnt := 0
+
+		for stream.Next() {
+			chunksCnt++
+		}
+		if err := stream.Err(); err != nil {
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		}
+		// number of chunks is number of tokens + 2 (one chunk with usage info and one closing chunk)
+		gomega.Expect(chunksCnt).To(gomega.BeNumerically("==", len(common.Tokenize(prompt))+2))
+	} else {
+		_, err = openaitextclient.Completions.New(ctx, params)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+	}
 
 	return client
 }
@@ -198,6 +212,22 @@ func getOpenAIClientAndChatParams(client option.HTTPClient, model string, messag
 	return openaiclient, params
 }
 
+// getOpenAIClientAndTextParams - creates an openai client and params for /completions call based on the given parameters
+func getOpenAIClientAndTextParams(client option.HTTPClient, model string, message string, streaming bool) (openai.Client, openai.CompletionNewParams) {
+	openaiclient := openai.NewClient(
+		option.WithBaseURL(baseURL),
+		option.WithHTTPClient(client))
+
+	params := openai.CompletionNewParams{
+		Prompt: openai.CompletionNewParamsPromptUnion{OfString: param.Opt[string]{Value: message}},
+		Model:  openai.CompletionNewParamsModel(model),
+	}
+	if streaming {
+		params.StreamOptions = openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)}
+	}
+	return openaiclient, params
+}
+
 // nolint
 // getOpenAIClentAndCompletionParams - creates an openai client and params for /completions call based on the given parameters
 func getOpenAIClentAndCompletionParams(client option.HTTPClient, model string, message string,
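
The updated test drives the simulator through the text-completions streaming path with the openai-go client and checks that the number of SSE chunks equals the number of prompt tokens plus two (one usage chunk and one closing chunk). The following is a hedged sketch of the same consumption pattern in isolation, assuming the openai and context imports already used in this file; the helper name and the usage lookup are illustrative, not part of the test:

// countChunks is a hypothetical helper mirroring the check in the test above:
// it consumes a streaming /completions response and returns the chunk count
// and the token total reported in the usage chunk.
func countChunks(ctx context.Context, client openai.Client, params openai.CompletionNewParams) (int, int64, error) {
	stream := client.Completions.NewStreaming(ctx, params)
	defer stream.Close()

	chunks := 0
	var totalTokens int64
	for stream.Next() {
		chunks++
		if u := stream.Current().Usage; u.TotalTokens > 0 {
			totalTokens = u.TotalTokens // the usage chunk, present because IncludeUsage is set
		}
	}
	return chunks, totalTokens, stream.Err()
}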

pkg/llm-d-inference-sim/worker.go

Lines changed: 19 additions & 8 deletions

@@ -19,6 +19,7 @@ package llmdinferencesim
 
 import (
 	"context"
+	"sync"
 	"time"
 
 	"github.com/go-logr/logr"
@@ -49,22 +50,31 @@ func (w *worker) waitForRequests() {
 			w.logger.V(4).Info("worker done", "id", w.id)
 			return
 		case req := <-w.reqChan:
-			w.processor.processRequest(req)
+			w.processor.processRequest(req, nil)
 			w.finishedChan <- &requestCompleted{worker: w, model: req.CompletionReq.GetModel()}
 		}
+
 	}
 }
 
 type requestProcessor interface {
-	processRequest(reqCtx *openaiserverapi.CompletionReqCtx)
+	processRequest(reqCtx *openaiserverapi.CompletionReqCtx, wg *sync.WaitGroup)
 }
 
-func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx) {
-	start := time.Now()
-	defer func() {
-		common.WriteToChannel(s.metrics.reqInferenceTimeChan, time.Since(start).Seconds(), s.logger, "metrics.reqInferenceTimeChan")
-	}()
+func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx, _ *sync.WaitGroup) {
+	startTime := time.Now()
+	wg := sync.WaitGroup{}
+	wg.Add(1)
+
+	go s.processRequestAsync(reqCtx, &wg)
+
+	wg.Wait()
+	// calculate inference time and finish e2e latency calculation only when sure that request processing was finished for streaming requests too
+	common.WriteToChannel(s.metrics.e2eReqLatencyChan, time.Since(reqCtx.StartProcessing).Seconds(), s.logger, "metrics.e2eReqLatencyChan")
+	common.WriteToChannel(s.metrics.reqInferenceTimeChan, time.Since(startTime).Seconds(), s.logger, "metrics.reqInferenceTimeChan")
+}
 
+func (s *VllmSimulator) processRequestAsync(reqCtx *openaiserverapi.CompletionReqCtx, wg *sync.WaitGroup) {
 	req := reqCtx.CompletionReq
 	model := req.GetModel()
 	displayModel := s.getDisplayedModelName(model)
@@ -138,14 +148,15 @@ func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx)
 			// Logprobs configuration
 			logprobs: req.GetLogprobs(),
 		},
-			responseTokens, toolCalls, finishReason, usageDataToSend,
+			responseTokens, toolCalls, finishReason, usageDataToSend, wg,
 		)
 	} else {
 		if req.IsDoRemoteDecode() {
 			// in case this is prefill pod processing, return special finish reason
 			finishReason = dataset.RemoteDecodeFinishReason
 		}
		s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData)
+		wg.Done()
 	}
 
 	common.WriteToChannel(s.metrics.requestSuccessChan,
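
The worker reports itself back on finishedChan as soon as processRequest returns, so before this change a streaming request would free its worker while the previous response was still being written. Splitting the work into processRequestAsync and having processRequest wait on the WaitGroup closes that gap, and also lets both latency metrics be recorded after the response is truly complete. A stripped-down, hypothetical model of the worker loop and the race it avoids (standard library only, not the simulator's types):

package main

import (
	"fmt"
	"sync"
	"time"
)

type job struct{ id int }

// worker frees itself only after process returns; if process returned before
// streaming finished, the worker would be handed a new request too early.
func worker(reqChan <-chan job, freeChan chan<- struct{}) {
	for j := range reqChan {
		process(j)
		freeChan <- struct{}{}
	}
}

func process(j job) {
	start := time.Now()
	var wg sync.WaitGroup
	wg.Add(1)

	// stand-in for processRequestAsync: generate and stream the response
	go func() {
		defer wg.Done()
		time.Sleep(20 * time.Millisecond)
	}()

	wg.Wait() // block until the last chunk is sent
	fmt.Printf("job %d done, inference time %v\n", j.id, time.Since(start))
}

func main() {
	reqChan := make(chan job)
	freeChan := make(chan struct{})
	go worker(reqChan, freeChan)

	reqChan <- job{id: 1}
	<-freeChan // the worker is only freed after job 1 fully finished
	close(reqChan)
}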

pkg/openai-server-api/request.go

Lines changed: 2 additions & 0 deletions

@@ -19,6 +19,7 @@ package openaiserverapi
 
 import (
 	"sync"
+	"time"
 
 	"github.com/valyala/fasthttp"
 )
@@ -163,6 +164,7 @@ type CompletionReqCtx struct {
 	HTTPReqCtx       *fasthttp.RequestCtx
 	IsChatCompletion bool
 	Wg               *sync.WaitGroup
+	StartProcessing  time.Time
 }
 
 // ChatCompletionRequest defines structure of /chat/completion request
