
Commit 64d8d7f

Make writing to channels non-blocking (#225)

* Made writing to channels non-blocking
* Lint

Signed-off-by: irar2 <[email protected]>

1 parent 61c1c29 commit 64d8d7f

File tree: 6 files changed, +50 -32 lines changed

pkg/common/utils.go

Lines changed: 9 additions & 0 deletions
@@ -21,6 +21,7 @@ import (
 	"regexp"
 	"sync"
 
+	"github.com/go-logr/logr"
 	"github.com/google/uuid"
 )
 
@@ -127,3 +128,11 @@ func init() {
 func Tokenize(text string) []string {
 	return re.FindAllString(text, -1)
 }
+
+func WriteToChannel[T any](channel chan T, object T, logger logr.Logger, channelName string) {
+	select {
+	case channel <- object:
+	default:
+		logger.V(1).Info("failed to write to", "channel", channelName)
+	}
+}
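
The new helper makes every send best-effort: if the channel has no free buffer slot and no ready receiver, the value is dropped and a V(1) log line is emitted instead of blocking the caller. A minimal sketch of the semantics (the local writeToChannel copy, the discard logger, and the capacity-1 channel are illustration-only assumptions, not code from this repository):

package main

import (
	"fmt"

	"github.com/go-logr/logr"
)

// writeToChannel mirrors the helper added above: try the send,
// otherwise log at debug level and move on without blocking.
func writeToChannel[T any](channel chan T, object T, logger logr.Logger, channelName string) {
	select {
	case channel <- object:
	default:
		logger.V(1).Info("failed to write to", "channel", channelName)
	}
}

func main() {
	logger := logr.Discard() // stand-in for the simulator's logger
	ch := make(chan int, 1)  // buffered channel with room for one value

	writeToChannel(ch, 1, logger, "demo") // buffer has room: value is stored
	writeToChannel(ch, 2, logger, "demo") // buffer full, no receiver: value is dropped

	fmt.Println(<-ch) // prints 1; the second write never blocked the caller
}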

pkg/kv-cache/block_cache.go

Lines changed: 13 additions & 7 deletions
@@ -70,10 +70,10 @@ func newBlockCache(config *common.Configuration, logger logr.Logger, usageChan c
 	}, nil
 }
 
-func (b *blockCache) start(ctx context.Context) {
-	err := b.eventSender.Run(ctx)
+func (bc *blockCache) start(ctx context.Context) {
+	err := bc.eventSender.Run(ctx)
 	if err != nil {
-		b.logger.Info("sender stopped with error", "error", err)
+		bc.logger.Info("sender stopped with error", "error", err)
 	}
 }
 
@@ -139,20 +139,25 @@ func (bc *blockCache) startRequest(requestID string, blocks []uint64) (int, erro
 			}
 
 			delete(bc.unusedBlocks, oldestUnusedHash)
-			bc.eventChan <- EventData{action: eventActionRemove, hashValues: []uint64{oldestUnusedHash}}
+			common.WriteToChannel(bc.eventChan,
+				EventData{action: eventActionRemove, hashValues: []uint64{oldestUnusedHash}},
+				bc.logger, "block cache eventChan")
 		}
 
 		// Add the new block
 		bc.usedBlocks[block] = 1
-		bc.eventChan <- EventData{action: eventActionStore, hashValues: []uint64{block}}
+		common.WriteToChannel(bc.eventChan,
+			EventData{action: eventActionStore, hashValues: []uint64{block}},
+			bc.logger, "block cache eventChan")
 	}
 
 	// store the request mapping
 	bc.requestToBlocks[requestID] = make([]uint64, len(blocks))
 	copy(bc.requestToBlocks[requestID], blocks)
 
 	if bc.usageChan != nil {
-		bc.usageChan <- float64(len(bc.usedBlocks)) / float64(bc.maxBlocks)
+		common.WriteToChannel(bc.usageChan, float64(len(bc.usedBlocks))/float64(bc.maxBlocks),
+			bc.logger, "block cache usageChan")
 	}
 	return len(blockAreadyInUse) + len(blockToMoveToUsed), nil
 }

@@ -188,7 +193,8 @@ func (bc *blockCache) finishRequest(requestID string) error {
 	}
 
 	if bc.usageChan != nil {
-		bc.usageChan <- float64(len(bc.usedBlocks)) / float64(bc.maxBlocks)
+		common.WriteToChannel(bc.usageChan, float64(len(bc.usedBlocks))/float64(bc.maxBlocks),
+			bc.logger, "block cache usageChan")
 	}
 
 	// Remove the request mapping
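
Dropping a usage update here is harmless as long as a consumer keeps draining the channel, because the next successful write carries the up-to-date ratio. A sketch of that producer/consumer shape (the buffer size and the consumer loop are assumptions for illustration, not taken from the simulator):

package main

import (
	"fmt"
	"time"
)

func main() {
	// Stand-in for the block cache's usage channel; the real buffer size
	// is defined elsewhere in the simulator and not shown in this diff.
	usageChan := make(chan float64, 8)

	// Consumer goroutine drains the channel, so the non-blocking send
	// below rarely hits its default branch.
	go func() {
		for usage := range usageChan {
			fmt.Printf("kv-cache usage: %.2f\n", usage)
		}
	}()

	// Producer side, mimicking the updates in startRequest/finishRequest.
	for used := 1; used <= 4; used++ {
		select {
		case usageChan <- float64(used) / 4.0:
		default: // buffer full: skip this update; a later one will catch up
		}
	}

	time.Sleep(50 * time.Millisecond) // give the consumer time to print
	close(usageChan)
}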

pkg/llm-d-inference-sim/lora.go

Lines changed: 2 additions & 1 deletion
@@ -20,6 +20,7 @@ package llmdinferencesim
 import (
 	"encoding/json"
 
+	"github.com/llm-d/llm-d-inference-sim/pkg/common"
 	"github.com/valyala/fasthttp"
 )
 
@@ -139,6 +140,6 @@ func (s *VllmSimulator) decrementLora(model string) {
 	s.loras.loadedLoras[model]--
 	if s.loras.loadedLoras[model] <= 0 {
 		// last usage of this LoRA
-		s.loras.loraRemovable <- 1
+		common.WriteToChannel(s.loras.loraRemovable, 1, s.logger, "loraRemovable")
 	}
 }

pkg/llm-d-inference-sim/simulator.go

Lines changed: 13 additions & 10 deletions
@@ -420,7 +420,7 @@ func (s *VllmSimulator) processing(ctx context.Context) {
 
 			s.logger.V(4).Info("Sending the request to the processing channel", "model", model,
 				"req id", reqCtx.CompletionReq.GetRequestID(), "worker", worker.id)
-			worker.reqChan <- reqCtx
+			common.WriteToChannel(worker.reqChan, reqCtx, s.logger, "worker's reqChan")
 		}
 	}
 }

@@ -431,9 +431,9 @@ func (s *VllmSimulator) findRequestAndSendToProcess(worker *worker) bool {
 		// send this request for processing in this worker
 		s.logger.V(4).Info("Sending request to processing", "model", nextReq.CompletionReq.GetModel(),
 			"req", nextReq.CompletionReq.GetRequestID(), "worker", worker.id)
-		worker.reqChan <- nextReq
+		common.WriteToChannel(worker.reqChan, nextReq, s.logger, "worker's reqChan")
 		// decrement waiting requests metric
-		s.metrics.waitingReqChan <- -1
+		common.WriteToChannel(s.metrics.waitingReqChan, -1, s.logger, "metrics.waitingReqChan")
 		return true
 	}
 
@@ -450,9 +450,11 @@ func (s *VllmSimulator) addRequestToQueue(reqCtx *openaiserverapi.CompletionReqC
 		return
 	}
 	// increment the waiting requests metric
-	s.metrics.waitingReqChan <- 1
+	common.WriteToChannel(s.metrics.waitingReqChan, 1, s.logger, "metrics.waitingReqChan")
 	// update loraInfo metrics with the new waiting request
-	s.metrics.lorasChan <- loraUsage{reqCtx.CompletionReq.GetModel(), waitingUsageState}
+	common.WriteToChannel(s.metrics.lorasChan, loraUsage{reqCtx.CompletionReq.GetModel(), waitingUsageState},
+		s.logger, "metrics.lorasChan")
+
 }
 
 // handleCompletions general completion requests handler, support both text and chat completion APIs

@@ -487,18 +489,19 @@ func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatComple
 		IsChatCompletion: isChatCompletion,
 		Wg:               &wg,
 	}
-	s.newRequests <- reqCtx
+	common.WriteToChannel(s.newRequests, reqCtx, s.logger, "newRequests")
 	wg.Wait()
 }
 
 // request processing finished
 func (s *VllmSimulator) responseSentCallback(model string, isChatCompletion bool, requestID string) {
 	// decrement running requests count
-	s.metrics.runReqChan <- -1
+	common.WriteToChannel(s.metrics.runReqChan, -1, s.logger, "metrics.runReqChan")
 
 	if s.isLora(model) {
 		// update loraInfo metrics to reflect that the request processing has been finished
-		s.metrics.lorasChan <- loraUsage{model, doneUsageState}
+		common.WriteToChannel(s.metrics.lorasChan, loraUsage{model, doneUsageState},
+			s.logger, "metrics.lorasChan")
 	}
 
 	if s.config.EnableKVCache && !isChatCompletion {

@@ -580,14 +583,14 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
 	time.Sleep(time.Duration(ttft) * time.Millisecond)
 
 	// report ttft in seconds
-	s.metrics.ttftChan <- (float64(ttft) / 1000)
+	common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan")
 
 	for range usageData.CompletionTokens - 1 {
 		perTokenLatency := s.getInterTokenLatency()
 		time.Sleep(time.Duration(perTokenLatency) * time.Millisecond)
 
 		// report tpot in seconds
-		s.metrics.tpotChan <- float64(perTokenLatency) / 1000
+		common.WriteToChannel(s.metrics.tpotChan, (float64(perTokenLatency) / 1000), s.logger, "metrics.tpotChan")
 	}
 	s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp)
 

pkg/llm-d-inference-sim/streaming.go

Lines changed: 2 additions & 2 deletions
@@ -104,14 +104,14 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ
 	ttft := s.getWaitTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill)
 	time.Sleep(time.Duration(ttft) * time.Millisecond)
 	// report ttft in seconds
-	s.metrics.ttftChan <- (float64(ttft) / 1000)
+	common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan")
 
 	for i, token := range genTokens {
 		if i != 0 {
 			interTokenLat := s.getInterTokenLatency()
 			time.Sleep(time.Duration(interTokenLat) * time.Millisecond)
 			// report tpot in seconds
-			s.metrics.tpotChan <- float64(interTokenLat) / 1000
+			common.WriteToChannel(s.metrics.tpotChan, (float64(interTokenLat) / 1000), s.logger, "metrics.tpotChan")
 		}
 
 		var toolChunkInsert *openaiserverapi.ToolCall

pkg/llm-d-inference-sim/worker.go

Lines changed: 11 additions & 12 deletions
@@ -21,6 +21,7 @@ import (
 	"context"
 
 	"github.com/go-logr/logr"
+	"github.com/llm-d/llm-d-inference-sim/pkg/common"
 	"github.com/llm-d/llm-d-inference-sim/pkg/dataset"
 	openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
 	"github.com/valyala/fasthttp"

@@ -63,12 +64,13 @@ func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx)
 	displayModel := s.getDisplayedModelName(model)
 
 	// increment running requests count
-	s.metrics.runReqChan <- 1
+	common.WriteToChannel(s.metrics.runReqChan, 1, s.logger, "metrics.runReqChan")
 
 	if s.isLora(model) {
 		// update loraInfo metric to reflect that
 		// the request has changed its status from waiting to running
-		s.metrics.lorasChan <- loraUsage{model, runningUsageState}
+		common.WriteToChannel(s.metrics.lorasChan, loraUsage{model, runningUsageState}, s.logger,
+			"metrics.lorasChan")
 	}
 
 	if s.config.EnableKVCache && !reqCtx.IsChatCompletion {

@@ -137,16 +139,13 @@ func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx)
 			s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData)
 		}
 
-		select {
-		case s.metrics.requestSuccessChan <- requestSuccessEvent{
-			promptTokens:     usageData.PromptTokens,
-			generationTokens: usageData.CompletionTokens,
-			maxTokens:        reqCtx.CompletionReq.GetMaxCompletionTokens(),
-			finishReason:     finishReason,
-		}:
-		default:
-			s.logger.V(1).Info("requestSuccessChan full, dropping success event")
-		}
+		common.WriteToChannel(s.metrics.requestSuccessChan,
+			requestSuccessEvent{
+				promptTokens:     usageData.PromptTokens,
+				generationTokens: usageData.CompletionTokens,
+				maxTokens:        reqCtx.CompletionReq.GetMaxCompletionTokens(),
+				finishReason:     finishReason},
+			s.logger, "metrics.requestSuccessChan")
 	}
 
 	s.logger.V(4).Info("Finished processing request", "id", req.GetRequestID())
