
Commit 61c1c29

irar2 and mayabar authored
New requests queue (#214)
* Initial implementation of queues
  Signed-off-by: Ira <[email protected]>
* More tests and cleanup
  Signed-off-by: irar2 <[email protected]>
* Added interface for requests, changed logs
  Signed-off-by: irar2 <[email protected]>
* Channel for new requests, decrement lora inside select, lint
  Signed-off-by: irar2 <[email protected]>
* Added new config value for waiting queue length
  Signed-off-by: irar2 <[email protected]>
* Make allocations after reading configuration. Code refactoring
  Signed-off-by: irar2 <[email protected]>
* Tests
  Signed-off-by: irar2 <[email protected]>
* Lint fix
  Signed-off-by: irar2 <[email protected]>
* Fixed merge typo
  Signed-off-by: irar2 <[email protected]>
* Review comments
  Signed-off-by: irar2 <[email protected]>
* Fixed automerge errors
  Signed-off-by: irar2 <[email protected]>
* Review comments
  Signed-off-by: irar2 <[email protected]>
* Update worker.go to solve conflicts
  Signed-off-by: Maya Barnea <[email protected]>
* Removed temporary directory
  Signed-off-by: irar2 <[email protected]>
* Fixed merge error
  Signed-off-by: irar2 <[email protected]>

---------

Signed-off-by: Ira <[email protected]>
Signed-off-by: irar2 <[email protected]>
Signed-off-by: Ira Rosen <[email protected]>
Signed-off-by: Maya Barnea <[email protected]>
Co-authored-by: Maya Barnea <[email protected]>
1 parent 1c3d559 commit 61c1c29

File tree

14 files changed (+1282 −356 lines)


README.md

Lines changed: 1 addition & 0 deletions
@@ -98,6 +98,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `max-cpu-loras`: maximum number of LoRAs to store in CPU memory, optional, must be >= than max-loras, default is max-loras
 - `max-model-len`: model's context window, maximum number of tokens in a single request including input and output, optional, default is 1024
 - `max-num-seqs`: maximum number of sequences per iteration (maximum number of inference requests that could be processed at the same time), default is 5
+- `max-waiting-queue-length`: maximum length of inference requests waiting queue, default is 1000
 - `mode`: the simulator mode, optional, by default `random`
   - `echo`: returns the same text that was sent in the request
   - `random`: returns a sentence chosen at random from a set of pre-defined sentences
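
The new option caps how many requests may wait for a free sequence slot before new arrivals are rejected. Going by the commit notes ("Channel for new requests"), a buffered channel is a natural way to model such a cap; below is a minimal standalone sketch of that idea (`boundedQueue`, `request`, and `enqueue` are illustrative names, not the simulator's actual API):

```go
package main

import (
	"errors"
	"fmt"
)

// request stands in for an inference request (hypothetical type).
type request struct{ id int }

// boundedQueue models a waiting queue with a hard capacity as a
// buffered channel, following the commit note "Channel for new requests".
type boundedQueue struct {
	ch chan *request
}

func newBoundedQueue(maxLen int) *boundedQueue {
	return &boundedQueue{ch: make(chan *request, maxLen)}
}

// enqueue adds a request if there is room; the non-blocking select
// turns the channel capacity into a hard queue-length limit.
func (q *boundedQueue) enqueue(r *request) error {
	select {
	case q.ch <- r:
		return nil
	default:
		return errors.New("waiting queue is full")
	}
}

func main() {
	q := newBoundedQueue(2) // as if --max-waiting-queue-length 2
	for i := 1; i <= 3; i++ {
		if err := q.enqueue(&request{id: i}); err != nil {
			fmt.Printf("request %d rejected: %v\n", i, err)
		} else {
			fmt.Printf("request %d queued\n", i)
		}
	}
}
```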

pkg/common/config.go

Lines changed: 9 additions & 1 deletion
@@ -80,6 +80,8 @@ type Configuration struct {
 	// MaxNumSeqs is maximum number of sequences per iteration (the maximum
 	// number of inference requests that could be processed at the same time)
 	MaxNumSeqs int `yaml:"max-num-seqs" json:"max-num-seqs"`
+	// MaxWaitingQueueLength defines maximum size of waiting requests queue
+	MaxWaitingQueueLength int `yaml:"max-waiting-queue-length" json:"max-waiting-queue-length"`
 	// MaxModelLen is the model's context window, the maximum number of tokens
 	// in a single request including input and output. Default value is 1024.
 	MaxModelLen int `yaml:"max-model-len" json:"max-model-len"`
@@ -329,6 +331,7 @@ func newConfig() *Configuration {
 		Port: vLLMDefaultPort,
 		MaxLoras: 1,
 		MaxNumSeqs: 5,
+		MaxWaitingQueueLength: 1000,
 		MaxModelLen: 1024,
 		Mode: ModeRandom,
 		Seed: time.Now().UnixNano(),
@@ -458,6 +461,10 @@ func (c *Configuration) validate() error {
 		return errors.New("max num seqs cannot be less than 1")
 	}

+	if c.MaxWaitingQueueLength < 1 {
+		return errors.New("max waiting queue size cannot be less than 1")
+	}
+
 	for _, lora := range c.LoraModules {
 		if lora.Name == "" {
 			return errors.New("empty LoRA name")
@@ -637,7 +644,8 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {

 	f.IntVar(&config.Port, "port", config.Port, "Port")
 	f.StringVar(&config.Model, "model", config.Model, "Currently 'loaded' model")
-	f.IntVar(&config.MaxNumSeqs, "max-num-seqs", config.MaxNumSeqs, "Maximum number of inference requests that could be processed at the same time (parameter to simulate requests waiting queue)")
+	f.IntVar(&config.MaxNumSeqs, "max-num-seqs", config.MaxNumSeqs, "Maximum number of inference requests that could be processed at the same time")
+	f.IntVar(&config.MaxWaitingQueueLength, "max-waiting-queue-length", config.MaxWaitingQueueLength, "Maximum length of inference requests waiting queue")
 	f.IntVar(&config.MaxLoras, "max-loras", config.MaxLoras, "Maximum number of LoRAs in a single batch")
 	f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
 	f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")
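
Both the default (1000) and the validation rule are visible above: the queue must be able to hold at least one request. A standalone sketch of the same guard, mirroring the names in the diff (a toy program for illustration, not the `pkg/common` package itself):

```go
package main

import (
	"errors"
	"fmt"
)

// Configuration mirrors only the field this commit adds.
type Configuration struct {
	MaxWaitingQueueLength int `yaml:"max-waiting-queue-length" json:"max-waiting-queue-length"`
}

// validate applies the same rule the diff adds: the waiting queue
// must be able to hold at least one request.
func (c *Configuration) validate() error {
	if c.MaxWaitingQueueLength < 1 {
		return errors.New("max waiting queue size cannot be less than 1")
	}
	return nil
}

func main() {
	for _, v := range []int{0, -1, 1000} {
		c := Configuration{MaxWaitingQueueLength: v}
		fmt.Printf("max-waiting-queue-length=%d -> %v\n", v, c.validate())
	}
}
```

The values 0 and -1 fail validation at startup, which is exactly what the two new test cases in config_test.go below assert.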

pkg/common/config_test.go

Lines changed: 10 additions & 0 deletions
@@ -452,6 +452,16 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--max-num-seqs", "-1",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid max-waiting-queue-length",
+			args: []string{"cmd", "--max-waiting-queue-length", "0",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid max-waiting-queue-length",
+			args: []string{"cmd", "--max-waiting-queue-length", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
 		{
 			name: "invalid time-factor-under-load",
 			args: []string{"cmd", "--time-factor-under-load", "0",

pkg/llm-d-inference-sim/latencies.go

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ func (s *VllmSimulator) getCurrLoadFactor() float64 {
 	if s.config.MaxNumSeqs <= 1 {
 		return 1.0
 	}
-	return 1 + (s.config.TimeFactorUnderLoad-1)*float64(s.nRunningReqs-1)/float64(s.config.MaxNumSeqs-1)
+	return 1 + (s.config.TimeFactorUnderLoad-1)*float64(s.metrics.nRunningReqs-1)/float64(s.config.MaxNumSeqs-1)
 }

 func (s *VllmSimulator) getTimeToFirstToken() int {
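
`getCurrLoadFactor` interpolates linearly between 1 and `TimeFactorUnderLoad` as the number of running requests approaches `MaxNumSeqs`; the only change here is that the counter now lives on the new metrics struct. A quick standalone check of the formula, using the same numbers as the tests below (the `loadFactor` wrapper is illustrative, not the simulator's method):

```go
package main

import "fmt"

// loadFactor reproduces the formula from getCurrLoadFactor for a
// quick sanity check outside the simulator.
func loadFactor(timeFactorUnderLoad float64, nRunningReqs, maxNumSeqs int64) float64 {
	if maxNumSeqs <= 1 {
		return 1.0
	}
	return 1 + (timeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(maxNumSeqs-1)
}

func main() {
	// Matches the test expectations: 1.0 when the factor is 1, the
	// full factor when nRunningReqs == MaxNumSeqs, and an
	// intermediate value when partially loaded.
	fmt.Println(loadFactor(1.0, 3, 11))  // 1.0
	fmt.Println(loadFactor(2.0, 11, 11)) // 2.0
	fmt.Println(loadFactor(2.0, 6, 11))  // 1.5
}
```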

pkg/llm-d-inference-sim/latencies_test.go

Lines changed: 11 additions & 9 deletions
@@ -41,6 +41,8 @@ var _ = Describe("Check random latencies", Ordered, func() {
 			KVCacheTransferLatencyStdDev: 2048,
 		}

+		simulator.metrics.runReqChan = make(chan int64, 100)
+
 		common.InitRandom(time.Now().UnixNano())
 	})
@@ -245,7 +247,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
 		simulator.config.TimeToFirstTokenStdDev = 0
 		simulator.config.TimeFactorUnderLoad = 1.0

-		simulator.runReqChan <- 100
+		simulator.metrics.runReqChan <- 100

 		ttft := simulator.getWaitTimeToFirstToken(128, 0, false)
 		Expect(ttft).To(Equal(42))
@@ -257,11 +259,11 @@ var _ = Describe("Check random latencies", Ordered, func() {
 		simulator.config.TimeFactorUnderLoad = 100.0
 		simulator.config.MaxNumSeqs = 1

-		for len(simulator.runReqChan) > 0 {
-			<-simulator.runReqChan
+		for len(simulator.metrics.runReqChan) > 0 {
+			<-simulator.metrics.runReqChan
 		}

-		simulator.runReqChan <- 1
+		simulator.metrics.runReqChan <- 1

 		ttft := simulator.getWaitTimeToFirstToken(128, 0, false)
 		Expect(ttft).To(Equal(42))
@@ -273,7 +275,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
 		simulator.config.TimeToFirstTokenStdDev = 0
 		simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
 		simulator.config.MaxNumSeqs = maxNumOfReq
-		simulator.nRunningReqs = int64(maxNumOfReq)
+		simulator.metrics.nRunningReqs = int64(maxNumOfReq)

 		ttft := simulator.getWaitTimeToFirstToken(128, 0, false)
 		Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))
@@ -296,7 +298,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
 		simulator.config.TimeToFirstTokenStdDev = 0
 		simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
 		simulator.config.MaxNumSeqs = maxNumOfReq
-		simulator.nRunningReqs = int64(nCurrNumOfReq)
+		simulator.metrics.nRunningReqs = int64(nCurrNumOfReq)

 		ttft := simulator.getWaitTimeToFirstToken(128, 0, false)
 		max := timeFactorUnderLoad * float64(42)
@@ -318,7 +320,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
 	It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() {
 		simulator.config.TimeFactorUnderLoad = 1.0
 		simulator.config.MaxNumSeqs = 11
-		simulator.nRunningReqs = 3
+		simulator.metrics.nRunningReqs = 3

 		factor := simulator.getCurrLoadFactor()
 		Expect(factor).To(BeNumerically("==", 1.0))
@@ -327,7 +329,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
 	It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() {
 		simulator.config.TimeFactorUnderLoad = 2.0
 		simulator.config.MaxNumSeqs = 11
-		simulator.nRunningReqs = 11
+		simulator.metrics.nRunningReqs = 11

 		factor := simulator.getCurrLoadFactor()
 		Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad))
@@ -337,7 +339,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
 	It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() {
 		simulator.config.TimeFactorUnderLoad = 2.0
 		simulator.config.MaxNumSeqs = 11
-		simulator.nRunningReqs = 6
+		simulator.metrics.nRunningReqs = 6

 		factor := simulator.getCurrLoadFactor()
 		Expect(factor).To(BeNumerically(">", 1.0))

pkg/llm-d-inference-sim/lora.go

Lines changed: 74 additions & 2 deletions
@@ -47,7 +47,7 @@ func (s *VllmSimulator) getLoras() []string {
 	return loras
 }

-func (s *VllmSimulator) loadLora(ctx *fasthttp.RequestCtx) {
+func (s *VllmSimulator) loadLoraAdaptor(ctx *fasthttp.RequestCtx) {
 	var req loadLoraRequest
 	err := json.Unmarshal(ctx.Request.Body(), &req)
 	if err != nil {
@@ -59,7 +59,7 @@ func (s *VllmSimulator) loadLora(ctx *fasthttp.RequestCtx) {
 	s.loraAdaptors.Store(req.LoraName, "")
 }

-func (s *VllmSimulator) unloadLora(ctx *fasthttp.RequestCtx) {
+func (s *VllmSimulator) unloadLoraAdaptor(ctx *fasthttp.RequestCtx) {
 	var req unloadLoraRequest
 	err := json.Unmarshal(ctx.Request.Body(), &req)
 	if err != nil {
@@ -70,3 +70,75 @@ func (s *VllmSimulator) unloadLora(ctx *fasthttp.RequestCtx) {

 	s.loraAdaptors.Delete(req.LoraName)
 }
+
+// Checks if the LoRA adaptor is loaded
+func (s *VllmSimulator) loraIsLoaded(model string) bool {
+	if !s.isLora(model) {
+		return true
+	}
+
+	s.loras.mux.RLock()
+	defer s.loras.mux.RUnlock()
+
+	_, ok := s.loras.loadedLoras[model]
+	return ok
+}
+
+// Load the LoRA adaptor if possible. Return false if not.
+func (s *VllmSimulator) loadLora(model string) bool {
+	if !s.isLora(model) {
+		return true
+	}
+
+	s.loras.mux.Lock()
+	defer s.loras.mux.Unlock()
+
+	// check if this LoRA is already loaded or within maxLoras slots
+	_, ok := s.loras.loadedLoras[model]
+	ok = ok || len(s.loras.loadedLoras) < s.loras.maxLoras
+	if !ok {
+		// if this LoRA is not loaded, and the number of loaded LoRAs reached
+		// maxLoras, try to find a LoRA that is not in use, and unload it
+		for lora, count := range s.loras.loadedLoras {
+			if count == 0 {
+				delete(s.loras.loadedLoras, lora)
+				ok = true
+				break
+			}
+		}
+	}
+	if ok {
+		s.loras.loadedLoras[model]++
+	}
+	return ok
+}
+
+// incrementLora increments the count of running requests using the model
+// (if the model is a LoRA). Can be called only for loaded LoRAs (that are
+// already in loras.loadedLoras)
+func (s *VllmSimulator) incrementLora(model string) {
+	if !s.isLora(model) {
+		return
+	}
+
+	s.loras.mux.Lock()
+	defer s.loras.mux.Unlock()
+	s.loras.loadedLoras[model]++
+}
+
+// decrementLora decrements the count of running requests using the model
+// (if the model is a LoRA)
+func (s *VllmSimulator) decrementLora(model string) {
+	if model == "" || !s.isLora(model) {
+		return
+	}
+
+	s.loras.mux.Lock()
+	defer s.loras.mux.Unlock()

+	s.loras.loadedLoras[model]--
+	if s.loras.loadedLoras[model] <= 0 {
+		// last usage of this LoRA
+		s.loras.loraRemovable <- 1
+	}
+}
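
Taken together, the new helpers turn `loadedLoras` into a reference-counted slot table: a request may proceed only if its LoRA is already resident, a free slot exists, or an idle adaptor (count 0) can be evicted, and `decrementLora` signals `loraRemovable` when a LoRA's last user finishes so queued requests can be retried. A condensed standalone sketch of that bookkeeping (`loraSlots`, `acquire`, and `release` are illustrative names; the simulator's real structs differ):

```go
package main

import (
	"fmt"
	"sync"
)

// loraSlots condenses the bookkeeping from lora.go: a map from LoRA
// name to the number of running requests using it, with at most
// maxLoras adaptors resident at once.
type loraSlots struct {
	mux       sync.Mutex
	loaded    map[string]int
	maxLoras  int
	removable chan int
}

// acquire mirrors loadLora: succeed if the LoRA is resident, a slot
// is free, or an idle LoRA can be evicted; otherwise the caller waits.
func (l *loraSlots) acquire(model string) bool {
	l.mux.Lock()
	defer l.mux.Unlock()

	_, ok := l.loaded[model]
	ok = ok || len(l.loaded) < l.maxLoras
	if !ok {
		for lora, count := range l.loaded {
			if count == 0 { // idle adaptor: evict it to make room
				delete(l.loaded, lora)
				ok = true
				break
			}
		}
	}
	if ok {
		l.loaded[model]++
	}
	return ok
}

// release mirrors decrementLora: when the last user finishes, signal
// that an adaptor became removable so waiting requests can be retried.
func (l *loraSlots) release(model string) {
	l.mux.Lock()
	defer l.mux.Unlock()
	l.loaded[model]--
	if l.loaded[model] <= 0 {
		l.removable <- 1
	}
}

func main() {
	l := &loraSlots{loaded: map[string]int{}, maxLoras: 1, removable: make(chan int, 1)}
	fmt.Println(l.acquire("lora-a")) // true: a slot is free
	fmt.Println(l.acquire("lora-b")) // false: lora-a is busy, no slot
	l.release("lora-a")              // lora-a idle, removable signaled
	<-l.removable
	fmt.Println(l.acquire("lora-b")) // true: idle lora-a was evicted
}
```

Note that `release` leaves the entry in the map with a zero count rather than deleting it, matching the diff: eviction happens lazily in `loadLora` only when a slot is actually needed.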
