Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ check-ginkgo:
.PHONY: check-golangci-lint
check-golangci-lint:
@command -v golangci-lint >/dev/null 2>&1 || { \
echo "❌ golangci-lint is not installed. Install from https://golangci-lint.run/usage/install/"; exit 1; }
echo "❌ golangci-lint is not installed. Install from https://golangci-lint.run/docs/welcome/install/"; exit 1; }

.PHONY: check-container-tool
check-container-tool:
Expand Down
67 changes: 34 additions & 33 deletions pkg/common/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,100 +39,101 @@ const (

type Configuration struct {
// Port defines on which port the simulator runs
Port int `yaml:"port"`
Port int `yaml:"port" json:"port"`
// Model defines the current base model name
Model string `yaml:"model"`
Model string `yaml:"model" json:"model"`
// ServedModelNames is one or many model names exposed by the API
ServedModelNames []string `yaml:"served-model-name"`
ServedModelNames []string `yaml:"served-model-name" json:"served-model-name"`
// MaxLoras defines maximum number of loaded LoRAs
MaxLoras int `yaml:"max-loras"`
MaxLoras int `yaml:"max-loras" json:"max-loras"`
// MaxCPULoras defines maximum number of LoRAs to store in CPU memory
MaxCPULoras int `yaml:"max-cpu-loras"`
MaxCPULoras int `yaml:"max-cpu-loras" json:"max-cpu-loras"`
// MaxNumSeqs is maximum number of sequences per iteration (the maximum
// number of inference requests that could be processed at the same time)
MaxNumSeqs int `yaml:"max-num-seqs"`
MaxNumSeqs int `yaml:"max-num-seqs" json:"max-num-seqs"`
// MaxModelLen is the model's context window, the maximum number of tokens
// in a single request including input and output. Default value is 1024.
MaxModelLen int `yaml:"max-model-len"`
MaxModelLen int `yaml:"max-model-len" json:"max-model-len"`
// LoraModulesString is a list of LoRA adapters as strings
LoraModulesString []string `yaml:"lora-modules"`
LoraModulesString []string `yaml:"lora-modules" json:"lora-modules"`
// LoraModules is a list of LoRA adapters
LoraModules []LoraModule

// TimeToFirstToken time before the first token will be returned, in milliseconds
TimeToFirstToken int `yaml:"time-to-first-token"`
TimeToFirstToken int `yaml:"time-to-first-token" json:"time-to-first-token"`
// TimeToFirstTokenStdDev standard deviation for time before the first token will be returned,
// in milliseconds, optional, default is 0, can't be more than 30% of TimeToFirstToken, will not
// cause the actual time to first token to differ by more than 70% from TimeToFirstToken
TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev"`
TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`
// InterTokenLatency time between generated tokens, in milliseconds
InterTokenLatency int `yaml:"inter-token-latency"`
InterTokenLatency int `yaml:"inter-token-latency" json:"inter-token-latency"`
// InterTokenLatencyStdDev standard deviation for time between generated tokens, in milliseconds,
// optional, default is 0, can't be more than 30% of InterTokenLatency, will not cause the actual
// inter token latency to differ by more than 70% from InterTokenLatency
InterTokenLatencyStdDev int `yaml:"inter-token-latency-std-dev"`
InterTokenLatencyStdDev int `yaml:"inter-token-latency-std-dev" json:"inter-token-latency-std-dev"`
// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated,
// in milliseconds
KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency" json:"kv-cache-transfer-latency"`
// KVCacheTransferLatencyStdDev standard deviation for time to "transfer" kv-cache from another
// vLLM instance in case P/D is activated, in milliseconds, optional, default is 0, can't be more
// than 30% of KVCacheTransferLatency, will not cause the actual latency to differ by more than 70% from
// KVCacheTransferLatency
KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev"`
KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"`

// Mode defines the simulator response generation mode, valid values: echo, random
Mode string `yaml:"mode"`
Mode string `yaml:"mode" json:"mode"`
// Seed defines random seed for operations
Seed int64 `yaml:"seed"`
Seed int64 `yaml:"seed" json:"seed"`

// MaxToolCallIntegerParam defines the maximum possible value of integer parameters in a tool call,
// optional, defaults to 100
MaxToolCallIntegerParam int `yaml:"max-tool-call-integer-param"`
MaxToolCallIntegerParam int `yaml:"max-tool-call-integer-param" json:"max-tool-call-integer-param"`
// MinToolCallIntegerParam defines the minimum possible value of integer parameters in a tool call,
// optional, defaults to 0
MinToolCallIntegerParam int `yaml:"min-tool-call-integer-param"`
MinToolCallIntegerParam int `yaml:"min-tool-call-integer-param" json:"min-tool-call-integer-param"`
// MaxToolCallNumberParam defines the maximum possible value of number (float) parameters in a tool call,
// optional, defaults to 100
MaxToolCallNumberParam float64 `yaml:"max-tool-call-number-param"`
MaxToolCallNumberParam float64 `yaml:"max-tool-call-number-param" json:"max-tool-call-number-param"`
// MinToolCallNumberParam defines the minimum possible value of number (float) parameters in a tool call,
// optional, defaults to 0
MinToolCallNumberParam float64 `yaml:"min-tool-call-number-param"`
MinToolCallNumberParam float64 `yaml:"min-tool-call-number-param" json:"min-tool-call-number-param"`

// MaxToolCallArrayParamLength defines the maximum possible length of array parameters in a tool call,
// optional, defaults to 5
MaxToolCallArrayParamLength int `yaml:"max-tool-call-array-param-length"`
MaxToolCallArrayParamLength int `yaml:"max-tool-call-array-param-length" json:"max-tool-call-array-param-length"`
// MinToolCallArrayParamLength defines the minimum possible length of array parameters in a tool call,
// optional, defaults to 1
MinToolCallArrayParamLength int `yaml:"min-tool-call-array-param-length"`
MinToolCallArrayParamLength int `yaml:"min-tool-call-array-param-length" json:"min-tool-call-array-param-length"`

// ToolCallNotRequiredParamProbability is the probability to add a parameter, that is not required,
// in a tool call, optional, defaults to 50
ToolCallNotRequiredParamProbability int `yaml:"tool-call-not-required-param-probability"`
ToolCallNotRequiredParamProbability int `yaml:"tool-call-not-required-param-probability" json:"tool-call-not-required-param-probability"`
// ObjectToolCallNotRequiredParamProbability is the probability to add a field, that is not required,
// in an object in a tool call, optional, defaults to 50
ObjectToolCallNotRequiredParamProbability int `yaml:"object-tool-call-not-required-field-probability"`
ObjectToolCallNotRequiredParamProbability int `yaml:"object-tool-call-not-required-field-probability" json:"object-tool-call-not-required-field-probability"`

// EnableKVCache defines if kv cache feature will be enabled
EnableKVCache bool `yaml:"enable-kvcache"`
EnableKVCache bool `yaml:"enable-kvcache" json:"enable-kvcache"`
// KVCacheSize is the maximum number of token blocks in kv cache, the default value is 1024
KVCacheSize int `yaml:"kv-cache-size"`
KVCacheSize int `yaml:"kv-cache-size" json:"kv-cache-size"`

// TokenizersCacheDir is the directory for caching tokenizers
TokenizersCacheDir string `yaml:"tokenizers-cache-dir"`
TokenizersCacheDir string `yaml:"tokenizers-cache-dir" json:"tokenizers-cache-dir"`
// TokenBlockSize is token block size for contiguous chunks of tokens, possible values: 8,16,32,64,128, defaults to 16
TokenBlockSize int `yaml:"block-size"`
TokenBlockSize int `yaml:"block-size" json:"block-size"`
// HashSeed is the seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)
HashSeed string `yaml:"hash-seed"`
HashSeed string `yaml:"hash-seed" json:"hash-seed"`

// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
ZMQEndpoint string `yaml:"zmq-endpoint"`
ZMQEndpoint string `yaml:"zmq-endpoint" json:"zmq-endpoint"`
// ZMQMaxConnectAttempts defines the maximum number (10) of retries when ZMQ connection fails
ZMQMaxConnectAttempts uint `yaml:"zmq-max-connect-attempts"`
ZMQMaxConnectAttempts uint `yaml:"zmq-max-connect-attempts" json:"zmq-max-connect-attempts"`

// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
EventBatchSize int `yaml:"event-batch-size"`
EventBatchSize int `yaml:"event-batch-size" json:"event-batch-size"`

// FakeMetrics is a set of metrics to send to Prometheus instead of the real data
FakeMetrics *Metrics `yaml:"fake-metrics"`
FakeMetrics *Metrics `yaml:"fake-metrics" json:"fake-metrics"`
}

type Metrics struct {
Expand Down
40 changes: 40 additions & 0 deletions pkg/llm-d-inference-sim/simulator.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package llmdinferencesim
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"os"
Expand Down Expand Up @@ -114,8 +115,14 @@ func (s *VllmSimulator) Start(ctx context.Context) error {
if err != nil {
return err
}

s.config = config

err = s.showConfig(s.logger)
if err != nil {
return err
}

for _, lora := range config.LoraModules {
s.loraAdaptors.Store(lora.Name, "")
}
Expand Down Expand Up @@ -708,3 +715,36 @@ func (s *VllmSimulator) getDisplayedModelName(reqModel string) string {
}
return s.config.ServedModelNames[0]
}

// showConfig logs the simulator's effective configuration as indented JSON.
// Internal-only representations (the raw LoRA string forms) are stripped
// before display. It returns an error when the target logger is a discard
// logger or when JSON (un)marshalling fails.
func (s *VllmSimulator) showConfig(tgtLgr logr.Logger) error {
	// Discard() is the no-op logger; logging the configuration to it would
	// be silently useless, so treat it as a caller error.
	if tgtLgr == logr.Discard() {
		return errors.New("target logger is a discard logger, cannot show configuration")
	}
	cfgJSON, err := json.Marshal(s.config)
	if err != nil {
		return fmt.Errorf("failed to marshal configuration to JSON: %w", err)
	}

	// Round-trip through a generic map so individual keys can be adjusted
	// before display.
	var m map[string]interface{}
	if err := json.Unmarshal(cfgJSON, &m); err != nil {
		return fmt.Errorf("failed to unmarshal JSON to map: %w", err)
	}

	// Show the parsed LoRA modules instead of their raw string form:
	// LoraModulesString marshals under the json tag "lora-modules", so
	// overwriting that key replaces it; LoraModules has no json tag and
	// marshals under its Go field name, which we then remove.
	// (A delete of the key "LoraModulesString" would be dead code — the
	// field never marshals under that name.)
	m["lora-modules"] = m["LoraModules"]
	delete(m, "LoraModules")

	// clean fake-metrics field: drop the raw string form of the LoRA metrics
	// NOTE(review): assumes Metrics.LorasString has no json tag — confirm
	// against the Metrics struct definition.
	if field, ok := m["fake-metrics"].(map[string]interface{}); ok {
		delete(field, "LorasString")
	}

	// show in JSON
	cfgJSON, err = json.MarshalIndent(m, "", " ")
	if err != nil {
		return fmt.Errorf("failed to marshal configuration to JSON: %w", err)
	}
	tgtLgr.Info("Configuration:", "", string(cfgJSON))
	return nil
}