105 changes: 96 additions & 9 deletions README.md
@@ -65,32 +65,97 @@ API responses contain a subset of the fields provided by the OpenAI API.
- messages
- role
- content
- tool_calls
- function
- name
- arguments
- id
- type
- index
- max_tokens
- max_completion_tokens
- tools
- type
- function
- name
- arguments
- tool_choice
- logprobs
- top_logprobs
- stream_options
- include_usage
- do_remote_decode
- do_remote_prefill
- remote_block_ids
- remote_engine_id
- remote_host
- remote_port
- ignore_eos
- **response**
- id
- created
- model
- choices
- index
- finish_reason
- message
- index
- finish_reason
- message
- logprobs
- content
- token
- logprob
- bytes
- top_logprobs
- usage
- object
- do_remote_decode
- do_remote_prefill
- remote_block_ids
- remote_engine_id
- remote_host
- remote_port
- `/v1/completions`
- **request**
- stream
- model
- prompt
- max_tokens (for future usage)
- max_tokens
- stream_options
- include_usage
- do_remote_decode
- do_remote_prefill
- remote_block_ids
- remote_engine_id
- remote_host
- remote_port
- ignore_eos
- logprobs
- **response**
- id
- created
- model
- choices
- text
- index
- finish_reason
- text
- logprobs
- tokens
- token_logprobs
- top_logprobs
- text_offset
- usage
- object
- do_remote_decode
- do_remote_prefill
- remote_block_ids
- remote_engine_id
- remote_host
- remote_port
- `/v1/models`
- **response**
- object (list)
- object
- data
- id
- object (model)
- object
- created
- owned_by
- root
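To make the field lists concrete, here is a hypothetical `/v1/models` response containing only the fields listed above (all values are placeholders, not actual simulator output):

{
  "object": "list",
  "data": [
    {
      "id": "my-model",
      "object": "model",
      "created": 1257894567,
      "owned_by": "vllm",
      "root": "my-model"
    }
  ]
}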
@@ -158,8 +223,22 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
- `loras` - an array containing LoRA information objects, each with the fields: `running` (a comma-separated list of LoRAs in use by running requests), `waiting` (a comma-separated list of LoRAs to be used by waiting requests), and `timestamp` (seconds since Jan 1 1970, the timestamp of this metric).
- `ttft-buckets-values` - array of values for time-to-first-token buckets; each value corresponds to one bucket. The array may contain fewer values than the number of buckets; all trailing missing values are assumed to be 0. Bucket upper boundaries are: 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf.
- `tpot-buckets-values` - array of values for time-per-output-token buckets; same conventions as above. Bucket upper boundaries are: 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf.
- `e2erl-buckets-values` - array of values for end-to-end request latency buckets; same conventions as above. Bucket upper boundaries are: 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf.
- `queue-time-buckets-values` - array of values for request queue time buckets; same conventions and boundaries as `e2erl-buckets-values`.
- `inf-time-buckets-values` - array of values for request inference time buckets; same conventions and boundaries as `e2erl-buckets-values`.
- `prefill-time-buckets-values` - array of values for request prefill time buckets; same conventions and boundaries as `e2erl-buckets-values`.
- `decode-time-buckets-values` - array of values for request decode time buckets; same conventions and boundaries as `e2erl-buckets-values`.
- `request-prompt-tokens` - array of values for prompt-length buckets.
- `request-generation-tokens` - array of values for generation-length buckets.
- `request-params-max-tokens` - array of values for `max_tokens` parameter buckets.
- `request-success-total` - number of successful requests per finish reason; keyed by finish reason (stop, length, etc.).
<br>
Example:<br>
**Example:**<br>
--fake-metrics '{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}'
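A second sketch with fake histogram and counter values (illustrative numbers only; each array fills the leading buckets described above, and `request-success-total` is keyed by finish reason):<br>
--fake-metrics '{"ttft-buckets-values":[10,20,5],"tpot-buckets-values":[50,30],"request-success-total":{"stop":95,"length":5}}'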
---
- `data-parallel-size`: number of ranks to run in Data Parallel deployment, from 1 to 8, default is 1. Ports are assigned sequentially: rank 0 runs on the configured `port`, rank 1 on `port`+1, etc. For example, with `data-parallel-size` 3 and `port` 8000, the ranks listen on ports 8000, 8001, and 8002.
@@ -177,6 +256,10 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
- Example URL `https://huggingface.co/datasets/hf07397/inference-sim-datasets/resolve/91ffa7aafdfd6b3b1af228a517edc1e8f22cd274/huggingface/ShareGPT_Vicuna_unfiltered/conversations.sqlite3`
- `dataset-in-memory`: If true, the entire dataset will be loaded into memory for faster access. This may require significant memory depending on the size of the dataset. Default is false.
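As an illustration, a hypothetical combination of these flags (assuming the `dataset-path` and `dataset-url` flags described in this section) that downloads the dataset on first start and then serves it from memory:<br>
--dataset-path ./conversations.sqlite3 --dataset-url 'https://huggingface.co/datasets/hf07397/inference-sim-datasets/resolve/91ffa7aafdfd6b3b1af228a517edc1e8f22cd274/huggingface/ShareGPT_Vicuna_unfiltered/conversations.sqlite3' --dataset-in-memory true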
---
- `ssl-certfile`: Path to SSL certificate file for HTTPS (optional)
- `ssl-keyfile`: Path to SSL private key file for HTTPS (optional)
- `self-signed-certs`: Enable automatic generation of self-signed certificates for HTTPS
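For example (certificate paths are hypothetical), either supply existing certificates or let the simulator generate self-signed ones:<br>
--ssl-certfile /path/to/cert.pem --ssl-keyfile /path/to/key.pem<br>
--self-signed-certs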
---
In addition, as we are using klog, the following parameters are available:
- `add_dir_header`: if true, adds the file directory to the header of the log messages
- `alsologtostderr`: log to standard error as well as files (no effect when -logtostderr=true)
@@ -189,7 +272,11 @@ In addition, as we are using klog, the following parameters are available:
- `skip_headers`: if true, avoid header prefixes in the log messages
- `skip_log_headers`: if true, avoid headers when opening log files (no effect when -logtostderr=true)
- `stderrthreshold`: logs at or above this threshold go to stderr when writing to files and stderr (no effect when -logtostderr=true or -alsologtostderr=true) (default 2)
- `v`: number for the log level verbosity
- `v`: number for the log level verbosity. Supported levels:
- Warning (1) - warning messages
- Info (2) - general application messages, e.g., loaded configuration content, which responses dataset was loaded, etc.
- Debug (4) - debugging messages, e.g., `/completions` and `/chat/completions` requests received, load/unload LoRA requests processed, etc.
- Trace (5) - highest verbosity, e.g., detailed messages on completions request handling and request queue processing, etc.
- `vmodule`: comma-separated list of pattern=N settings for file-filtered logging

## Environment variables
5 changes: 3 additions & 2 deletions cmd/llm-d-inference-sim/main.go
@@ -24,6 +24,7 @@ import (
"k8s.io/klog/v2"

"github.com/llm-d/llm-d-inference-sim/cmd/signals"
"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
vllmsim "github.com/llm-d/llm-d-inference-sim/pkg/llm-d-inference-sim"
)

@@ -33,11 +34,11 @@ func main() {
ctx := klog.NewContext(context.Background(), logger)
ctx = signals.SetupSignalHandler(ctx)

logger.Info("Starting vLLM simulator")
logger.V(logging.INFO).Info("Starting vLLM simulator")

vllmSim, err := vllmsim.New(logger)
if err != nil {
logger.Error(err, "Failed to create vLLM simulator")
logger.Error(err, "failed to create vLLM simulator")
return
}
if err := vllmSim.Start(ctx); err != nil {
24 changes: 24 additions & 0 deletions pkg/common/logging/levels.go
@@ -0,0 +1,24 @@
/*
Copyright 2025 The llm-d-inference-sim Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package logging

const (
WARN = 1
INFO = 2
DEBUG = 4
TRACE = 5
)
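A minimal sketch of how these constants combine with a klog-backed logr.Logger (mirroring the call sites changed in this PR; the message strings are illustrative):

package main

import (
	"k8s.io/klog/v2"

	"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
)

func main() {
	// klog.Background() returns a logr.Logger backed by klog, as in main.go above.
	logger := klog.Background()
	// INFO (2): general application messages, shown with -v=2 and above.
	logger.V(logging.INFO).Info("Starting vLLM simulator")
	// DEBUG (4): per-request debugging messages, shown with -v=4 and above.
	logger.V(logging.DEBUG).Info("completion request received")
}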
3 changes: 2 additions & 1 deletion pkg/common/publisher.go
@@ -25,6 +25,7 @@ import (
"sync/atomic"
"time"

"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
zmq "github.com/pebbe/zmq4"
"github.com/vmihailenco/msgpack/v5"
"k8s.io/klog/v2"
@@ -93,7 +94,7 @@ func (p *Publisher) PublishEvent(ctx context.Context, topic string, batch interf
return fmt.Errorf("failed to send message to topic %s: %w", topic, err)
}

logger.Info("Published event batch", "topic", topic, "seq", seq)
logger.V(logging.TRACE).Info("Published event batch", "topic", topic, "seq", seq)
return nil
}

3 changes: 2 additions & 1 deletion pkg/common/utils.go
@@ -23,6 +23,7 @@ import (

"github.com/go-logr/logr"
"github.com/google/uuid"
"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
)

// Definition of buckets for time-to-first-token and time-per-output-token metrics, each value is an upper boundary of a bucket
@@ -149,6 +150,6 @@ func WriteToChannel[T any](channel chan T, object T, logger logr.Logger, channel
select {
case channel <- object:
default:
logger.V(1).Info("failed to write to", "channel", channelName)
logger.V(logging.WARN).Info("failed to write to", "channel", channelName)
}
}
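A hypothetical call site (the channel and value names are illustrative): the send is non-blocking, so when the channel is full the value is dropped and a WARN-level message is logged instead of stalling the caller.

// usageChan and usageUpdate are assumed to exist in the caller's scope.
common.WriteToChannel(usageChan, usageUpdate, logger, "usageChan")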
23 changes: 12 additions & 11 deletions pkg/dataset/custom_dataset.go
@@ -33,6 +33,7 @@ import (

"github.com/go-logr/logr"
"github.com/llm-d/llm-d-inference-sim/pkg/common"
"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
)

@@ -80,7 +81,7 @@ func (d *CustomDataset) downloadDataset(ctx context.Context, url string, path st
}
}()

d.logger.Info("Using dataset-url", "dataset-url", url)
d.logger.V(logging.INFO).Info("Using dataset-url", "dataset-url", url)
resp, err := http.Get(url)
if err != nil {
return err
@@ -181,9 +182,9 @@ func (pr *progressReader) logProgress(pct int) {
speed := float64(pr.downloaded) / (1024 * 1024 * elapsedTime)
remainingTime := float64(pr.total-pr.downloaded) / (float64(pr.downloaded) / elapsedTime)
if pct != 100 {
pr.logger.Info(fmt.Sprintf("Download progress: %d%%, Speed: %.2f MB/s, Remaining time: %.2fs", pct, speed, remainingTime))
pr.logger.V(logging.INFO).Info("Dataset download progress", "%", pct, "speed (MB/s)", speed, "remaining time (s)", remainingTime)
} else {
pr.logger.Info(fmt.Sprintf("Download completed: 100%%, Average Speed: %.2f MB/s, Total time: %.2fs", speed, elapsedTime))
pr.logger.V(logging.INFO).Info("Download completed", "average speed (MB/s)", speed, "total time (s)", elapsedTime)
}
}

@@ -248,7 +249,7 @@ func (d *CustomDataset) getRecordsCount() (int, error) {
}

func (d *CustomDataset) loadDatabaseInMemory(path string) error {
d.logger.Info("Loading database into memory...")
d.logger.V(logging.INFO).Info("Loading database into memory...")
start := time.Now()

// Create in-memory database
@@ -301,7 +302,7 @@ func (d *CustomDataset) loadDatabaseInMemory(path string) error {
}

loadTime := time.Since(start)
d.logger.Info("Database loaded into memory", "load_time", loadTime.String())
d.logger.V(logging.INFO).Info("Database loaded into memory", "load_time", loadTime.String())
return nil
}

@@ -354,9 +355,9 @@ func (d *CustomDataset) connectToDB(path string, useInMemory bool) error {
}

if useInMemory {
d.logger.Info("In-memory database connected successfully", "path", path, "records count", count)
d.logger.V(logging.INFO).Info("In-memory database connected successfully", "path", path, "records count", count)
} else {
d.logger.Info("Database connected successfully", "path", path, "records count", count)
d.logger.V(logging.INFO).Info("Database connected successfully", "path", path, "records count", count)
}
return nil
}
@@ -368,7 +369,7 @@ func (d *CustomDataset) Init(ctx context.Context, logger logr.Logger, path strin
}
d.hasWarned = false
if url == "" {
d.logger.Info("Using dataset from", "path", path)
d.logger.V(logging.INFO).Info("Using dataset from", "path", path)
return d.connectToDB(path, useInMemory)
}
_, err := os.Stat(path)
@@ -386,7 +387,7 @@
return fmt.Errorf("failed to download dataset: %w", err)
}
}
d.logger.Info("Using dataset path", "dataset-path", path)
d.logger.V(logging.INFO).Info("Using dataset path", "dataset-path", path)

return d.connectToDB(path, useInMemory)
}
@@ -448,7 +449,7 @@ func (d *CustomDataset) query(query string, nTokens int, random *common.Random)
rows, err := d.db.Query(query)
if err != nil {
if !d.hasWarned {
d.logger.Error(err, "Failed to query database. Ensure dataset file is still valid. Will generate random tokens instead.")
d.logger.Error(err, "failed to query database. Ensure dataset file is still valid. Will generate random tokens instead.")
d.hasWarned = true
}
return [][]string{GenPresetRandomTokens(random, nTokens)}, nil
@@ -472,7 +473,7 @@ func (d *CustomDataset) GenerateTokens(req openaiserverapi.CompletionRequest, nT
// filter out results according to finish reason
var filteredTokensList [][]string
if finishReason != LengthFinishReason && finishReason != StopFinishReason {
d.logger.Error(errors.New("unknown finish reason"), "Unexpected finish reason", "reason", finishReason)
d.logger.Error(errors.New("unknown finish reason"), "unexpected finish reason", "reason", finishReason)
}
for _, tokens := range tokensList {
if finishReason == StopFinishReason && len(tokens) <= nTokens {
2 changes: 1 addition & 1 deletion pkg/kv-cache/block_cache.go
@@ -73,7 +73,7 @@ func newBlockCache(config *common.Configuration, logger logr.Logger, usageChan c
func (bc *blockCache) start(ctx context.Context) {
err := bc.eventSender.Run(ctx)
if err != nil {
bc.logger.Info("sender stopped with error", "error", err)
bc.logger.Error(err, "Sender stopped with error")
}
}

7 changes: 4 additions & 3 deletions pkg/kv-cache/kv_cache.go
@@ -22,6 +22,7 @@ import (

"github.com/go-logr/logr"
"github.com/llm-d/llm-d-inference-sim/pkg/common"
"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvblock"
"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
@@ -63,7 +64,7 @@ func (h *KVCacheHelper) Run(ctx context.Context) {
}

func (h *KVCacheHelper) OnRequestStart(vllmReq openaiserverapi.CompletionRequest) error {
h.logger.Info("KV cache - process request")
h.logger.V(logging.TRACE).Info("KV cache - process request")

prompt := vllmReq.GetPrompt()
modelName := vllmReq.GetModel()
@@ -72,13 +73,13 @@
// tokenize the input
tokens, _, err := h.tokenizer.Encode(prompt, modelName)
if err != nil {
h.logger.Info("Prompt tokenization failed", "error", err.Error())
h.logger.Error(err, "prompt tokenization failed")
return err
}

// get block keys
blockKeys := h.tokensProcessor.TokensToKVBlockKeys(tokens, modelName)
h.logger.Info("found tokens", "tokens", tokens, "block-keys", blockKeys)
h.logger.V(logging.TRACE).Info("Found tokens", "tokens", tokens, "block-keys", blockKeys)

blockHashes := make([]uint64, len(blockKeys))
for i, key := range blockKeys {
5 changes: 3 additions & 2 deletions pkg/kv-cache/kv_cache_sender.go
@@ -22,6 +22,7 @@ import (

"github.com/go-logr/logr"
"github.com/llm-d/llm-d-inference-sim/pkg/common"
"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvevents"
"github.com/vmihailenco/msgpack/v5"
)
@@ -70,15 +71,15 @@ func (s *KVEventSender) Run(ctx context.Context) error {
case <-ctx.Done():
// Exiting, discard remaining events if any
if len(s.batch) > 0 {
s.logger.Info("Existing, discard remaining events", "num of events", len(s.batch))
s.logger.V(logging.INFO).Info("Exiting, discard remaining events", "num of events", len(s.batch))
}
return ctx.Err()

case eventData, ok := <-s.eventChan:
if !ok {
// Channel closed, discard remaining events and exit
if len(s.batch) > 0 {
s.logger.Info("Channel closed, discard remaining events", "num of events", len(s.batch))
s.logger.V(logging.INFO).Info("Channel closed, discard remaining events", "num of events", len(s.batch))
}
return nil
}