
Commit 1adeeb3

Use custom dataset as response source (#200)
* Show config in yaml
* Load or download response dataset
* Init dataset when sim starts and show downloading speed of url
* Fix tests and init dataset when loading sim
* Move dataset init to startSim
* Change db structure and add test cases
* Fix test
* Remove duplicates in request.go
* Move token generation to simulator
* Generate tokens instead of strings
* Move dataset.go to common
* Refactor: abstract dataset and move response generation from common to dataset
* Fix dataset tests
* Add tests for custom dataset
* Fix custom dataset test case
* Remove unnecessary config
* Add cli arg of dataset path and url, also update readme
* Return random from dataset if prompt hash does not hit
* Respect maxTokens
* Resolve conflicts and fix test case
* Update readme
* Remove unnecessary log
* Ignore test temp folder
* Update README
* Flat config
* Use ctx in main
* Update readme and dataset downloading logic
* Pass logger when init dataset
* Improve progress logging, show it every 5 seconds or 10%
* Use in memory database
* Use backup api to load dataset from disk to memory
* Remove duplicated log of Server starting
* Use klog
* Update readme

Signed-off-by: Qifan Deng <[email protected]>
1 parent b8eb7a4 commit 1adeeb3

26 files changed: +1415 additions, −475 deletions

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -7,3 +7,7 @@ vendor
 .DS_Store
 *.test
 manifests/dev-config.yaml
+pkg/dataset/.llm-d
+pkg/llm-d-inference-sim/tests-tmp/
+pkg/llm-d-inference-sim/.llm-d/
+.llm-d/

Makefile

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ format: ## Format Go source files
 test: $(GINKGO) download-tokenizer download-zmq ## Run tests
 	@printf "\033[33;1m==== Running tests ====\033[0m\n"
 ifdef GINKGO_FOCUS
-	CGO_ENABLED=1 $(GINKGO) -ldflags="$(GO_LDFLAGS)" -v -r --focus="$(GINKGO_FOCUS)"
+	CGO_ENABLED=1 ginkgo -ldflags="$(GO_LDFLAGS)" -v -r -- -ginkgo.v -ginkgo.focus="$(GINKGO_FOCUS)"
 else
 	CGO_ENABLED=1 $(GINKGO) -ldflags="$(GO_LDFLAGS)" -v -r
 endif

README.md

Lines changed: 14 additions & 1 deletion
@@ -149,7 +149,20 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 {"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}
 ---
 - `data-parallel-size`: number of ranks to run in Data Parallel deployment, from 1 to 8, default is 1. The ports will be assigned as follows: rank 0 will run on the configured `port`, rank 1 on `port`+1, etc.
-
+---
+- `dataset-path`: Optional local file path to the SQLite database file used for generating responses from a dataset.
+  - If not set, hardcoded preset responses will be used.
+  - If set but the file does not exist, the `dataset-url` will be used to download the database to the path specified by `dataset-path`.
+  - If the file exists but is currently occupied by another process, responses will be randomly generated from preset text (the same behavior as if the path were not set).
+  - Responses are retrieved from the dataset by the hash of the conversation history, with a fallback to a random dataset response, constrained by the maximum output tokens and EoS token handling, if no matching history is found.
+  - Refer to [llm-d converted ShareGPT](https://huggingface.co/datasets/hf07397/inference-sim-datasets/blob/0b60737c2dd2c570f486cef2efa7971b02e3efde/README.md) for detailed information on the expected format of the SQLite database file.
+- `dataset-url`: Optional URL for downloading the SQLite database file used for response generation.
+  - This parameter is only used if `dataset-path` is also set and the file does not exist at that path.
+  - If the file needs to be downloaded, it will be saved to the location specified by `dataset-path`.
+  - If the file already exists at the `dataset-path`, it will not be downloaded again.
+  - Example URL: `https://huggingface.co/datasets/hf07397/inference-sim-datasets/resolve/91ffa7aafdfd6b3b1af228a517edc1e8f22cd274/huggingface/ShareGPT_Vicuna_unfiltered/conversations.sqlite3`
+- `dataset-in-memory`: If true, the entire dataset will be loaded into memory for faster access. This may require significant memory depending on the size of the dataset. Default is false.
+---
 In addition, as we are using klog, the following parameters are available:
 - `add_dir_header`: if true, adds the file directory to the header of the log messages
 - `alsologtostderr`: log to standard error as well as files (no effect when -logtostderr=true)
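The README describes the lookup behavior declaratively: responses are keyed by a hash of the conversation history, with a random dataset response as the fallback. As an illustration only, here is a minimal Go sketch of that hash-then-fallback pattern; the function names, the in-memory map standing in for the SQLite table, and the hashing scheme are all assumptions of this sketch, not the simulator's actual API or schema (see the linked dataset README for the real format).

```go
package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"math/rand"
)

// responsesByHash is a hypothetical in-memory stand-in for the
// SQLite table that maps a conversation-history hash to responses.
var responsesByHash = map[string][]string{}

// hashHistory hashes a serialized conversation history. The real
// serialization and hash are defined by the dataset's schema.
func hashHistory(history []string) string {
	h := sha256.New()
	for _, m := range history {
		h.Write([]byte(m))
		h.Write([]byte{0}) // separator so ["ab"] != ["a","b"]
	}
	return hex.EncodeToString(h.Sum(nil))
}

// lookupResponse mimics the documented behavior: return a response
// matching the history's hash, or fall back to a random dataset response.
func lookupResponse(history []string, all []string) string {
	if rs, ok := responsesByHash[hashHistory(history)]; ok && len(rs) > 0 {
		return rs[rand.Intn(len(rs))]
	}
	return all[rand.Intn(len(all))]
}

func main() {
	all := []string{"fallback A", "fallback B"}
	hist := []string{"user: hi"}
	responsesByHash[hashHistory(hist)] = []string{"hello!"}
	fmt.Println(lookupResponse(hist, all)) // hash hit: prints "hello!"
	fmt.Println(lookupResponse([]string{"unseen"}, all)) // miss: random fallback
}
```

Note that this sketch omits the max-output-tokens and EoS-token constraints the README mentions; those are applied on top of the retrieved text.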

go.mod

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ require (
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/klauspost/compress v1.18.0 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
+	github.com/mattn/go-sqlite3 v1.14.32 // direct
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,8 @@ github.com/llm-d/llm-d-kv-cache-manager v0.3.0-rc1 h1:SDLiNrcreDcA9m9wfXAumFARDH
 github.com/llm-d/llm-d-kv-cache-manager v0.3.0-rc1/go.mod h1:tN80/D0Faf6pE2ocwFgTNoCxKPsqdsa2XnjQUqOaZ8Q=
 github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
+github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
+github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=

pkg/common/config.go

Lines changed: 24 additions & 0 deletions
@@ -181,6 +181,22 @@ type Configuration struct {
 	SSLKeyFile string `yaml:"ssl-keyfile" json:"ssl-keyfile"`
 	// SelfSignedCerts enables automatic generation of self-signed certificates for HTTPS
 	SelfSignedCerts bool `yaml:"self-signed-certs" json:"self-signed-certs"`
+
+	// DatasetPath is an optional local file path to the SQLite database file used for generating responses from a dataset.
+	// - If not set, hardcoded preset responses will be used.
+	// - If set but the file does not exist, the `dataset-url` will be used to download the database to the path specified by `dataset-path`.
+	// - If the file exists but is currently occupied by another process, responses will be randomly generated from preset text (the same behavior as if the path were not set).
+	// - Responses are retrieved from the dataset by the hash of the conversation history, with a fallback to a random dataset response, constrained by the maximum output tokens and EoS token handling, if no matching history is found.
+	// - Refer to [llm-d converted ShareGPT](https://huggingface.co/datasets/hf07397/inference-sim-datasets/blob/0b7ac1a4daf0aace1556326964bd75633372299e/README.md) for detailed information on the expected format of the SQLite database file.
+	DatasetPath string `yaml:"dataset-path" json:"dataset-path"`
+	// DatasetURL is an optional URL for downloading the SQLite database file used for response generation.
+	// - This parameter is only used if `dataset-path` is also set and the file does not exist at that path.
+	// - If the file needs to be downloaded, it will be saved to the location specified by `dataset-path`.
+	// - If the file already exists at the `dataset-path`, it will not be downloaded again.
+	// - Example URL: `https://huggingface.co/datasets/hf07397/inference-sim-datasets/resolve/91ffa7aafdfd6b3b1af228a517edc1e8f22cd274/huggingface/ShareGPT_Vicuna_unfiltered/conversations.sqlite3`
+	DatasetURL string `yaml:"dataset-url" json:"dataset-url"`
+	// DatasetInMemory defines whether to load the entire dataset into memory for faster access.
+	DatasetInMemory bool `yaml:"dataset-in-memory" json:"dataset-in-memory"`
 }

@@ -485,6 +501,10 @@ func (c *Configuration) validate() error {
 		return errors.New("cannot use both self-signed-certs and explicit ssl-certfile/ssl-keyfile")
 	}

+	if c.DatasetPath == "" && c.DatasetURL != "" {
+		return errors.New("dataset-path is required when dataset-url is set")
+	}
+
 	return nil
 }

@@ -564,6 +584,10 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")
 	f.IntVar(&config.DPSize, "data-parallel-size", config.DPSize, "Number of ranks to run")

+	f.StringVar(&config.DatasetPath, "dataset-path", config.DatasetPath, "Local path to the sqlite db file for response generation from a dataset")
+	f.StringVar(&config.DatasetURL, "dataset-url", config.DatasetURL, "URL to download the sqlite db file for response generation from a dataset")
+	f.BoolVar(&config.DatasetInMemory, "dataset-in-memory", config.DatasetInMemory, "Load the entire dataset into memory for faster access")
+
 	f.IntVar(&config.FailureInjectionRate, "failure-injection-rate", config.FailureInjectionRate, "Probability (0-100) of injecting failures")
 	failureTypes := getParamValueFromArgs("failure-types")
 	var dummyFailureTypes multiString
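The new validation rule in this hunk says `dataset-url` is only meaningful when `dataset-path` is also set. A self-contained sketch of just that rule (the `config` struct and `validateDataset` function here are stand-ins for the repository's unexported `Configuration.validate`, not its real API):

```go
package main

import (
	"errors"
	"fmt"
)

// config is a minimal stand-in for the relevant fields of the
// simulator's Configuration struct.
type config struct {
	DatasetPath string
	DatasetURL  string
}

// validateDataset reproduces the check added in this commit:
// a URL without a local path to download to is a configuration error.
func validateDataset(c config) error {
	if c.DatasetPath == "" && c.DatasetURL != "" {
		return errors.New("dataset-path is required when dataset-url is set")
	}
	return nil
}

func main() {
	// URL alone: rejected.
	fmt.Println(validateDataset(config{DatasetURL: "https://example.com/db.sqlite3"}))
	// Path plus URL: accepted (the URL is only used if the file is absent).
	fmt.Println(validateDataset(config{DatasetPath: "/tmp/db.sqlite3", DatasetURL: "https://example.com/db.sqlite3"}))
}
```

Note the asymmetry: a path without a URL is fine (the file is simply expected to exist), while a URL without a path is rejected because there is nowhere to save the download.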

pkg/common/utils.go

Lines changed: 0 additions & 263 deletions
@@ -17,82 +17,13 @@ limitations under the License.
 package common
 
 import (
-	"fmt"
-	"math"
 	"math/rand"
 	"regexp"
-	"strings"
 	"sync"
 
 	"github.com/google/uuid"
 )
 
-const (
-	ResponseLenMax              = 128
-	responseLenMean             = 40
-	responseLenStddev           = 20
-	stopFinishReasonProbability = 0.8
-
-	StopFinishReason         = "stop"
-	LengthFinishReason       = "length"
-	ToolsFinishReason        = "tool_calls"
-	RemoteDecodeFinishReason = "remote_decode"
-)
-
-// this array defines the probabilities for the buckets to be used for the generation of number of tokens in response
-var respLenBucketsProbabilities = [...]float64{0.2, 0.3, 0.2, 0.05, 0.1, 0.15}
-var cumulativeBucketsProbabilities []float64
-
-const (
-	flexBucketIndex    = 3
-	maxFixedBucketSize = 20
-)
-
-// list of responses to use in random mode for comepltion requests
-var chatCompletionFakeResponses = []string{
-	`Testing@, #testing 1$ ,2%,3^, [4&*5], 6~, 7-_ + (8 : 9) / \ < > .`,
-	`Testing, testing 1,2,3.`,
-	`I am fine, how are you today?`,
-	`I am your AI assistant, how can I help you today?`,
-	`Today is a nice sunny day.`,
-	`The temperature here is twenty-five degrees centigrade.`,
-	`Today it is partially cloudy and raining.`,
-	`To be or not to be that is the question.`,
-	`Alas, poor Yorick! I knew him, Horatio: A fellow of infinite jest`,
-	`The rest is silence. `,
-	`Give a man a fish and you feed him for a day; teach a man to fish and you feed him for a lifetime`,
-}
-
-func init() {
-	cumulativeBucketsProbabilities = make([]float64, len(respLenBucketsProbabilities))
-	sum := 0.0
-
-	for i, val := range respLenBucketsProbabilities {
-		sum += val
-		cumulativeBucketsProbabilities[i] = sum
-	}
-}
-
-// returns the max tokens or error if incorrect
-func GetMaxTokens(maxCompletionTokens *int64, maxTokens *int64) (*int64, error) {
-	var typeToken string
-	var tokens *int64
-	// if both arguments are passed,
-	// use maxCompletionTokens
-	// as in the real vllm
-	if maxCompletionTokens != nil {
-		tokens = maxCompletionTokens
-		typeToken = "max_completion_tokens"
-	} else if maxTokens != nil {
-		tokens = maxTokens
-		typeToken = "max_tokens"
-	}
-	if tokens != nil && *tokens < 1 {
-		return nil, fmt.Errorf("%s must be at least 1, got %d", typeToken, *tokens)
-	}
-	return tokens, nil
-}
-
 // ValidateContextWindow checks if the request fits within the model's context window
 // Returns validation result, actual completion tokens, and total tokens
 func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {
@@ -107,200 +38,6 @@ func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxMode
 	return isValid, completionTokens, totalTokens
 }
 
-// GetRandomResponseLen returns int in range [1, responseLenMax]
-// numbers are chosen according a gaussian distribution with mean responseLenMean, and standard deviation responseLenStddev
-func GetRandomResponseLen() int {
-	for {
-		val := rand.NormFloat64()*responseLenStddev + responseLenMean
-		if val >= 1 && val <= ResponseLenMax {
-			return int(math.Round(val))
-		}
-		// else reject and resample
-	}
-}
-
-// GetRandomFinishReason returns finish reason with the probability for 'stop' as defined by stopFinishReasonProbability
-func GetRandomFinishReason() string {
-	if rand.Float64() < stopFinishReasonProbability {
-		return StopFinishReason
-	}
-	return LengthFinishReason
-}
-
-// GetRandomText generates random text for the required number of tokens,
-// select randomly a sentence from chatCompletionFakeResponses,
-// if number of tokens is lower than required - select another sentence,
-// continue until the required number of tokens is achieved
-func GetRandomText(numOfTokens int) string {
-	allTokens := make([]string, 0)
-
-	for len(allTokens) < numOfTokens {
-		index := RandomInt(0, len(chatCompletionFakeResponses)-1)
-		// create tokens from text, splitting by spaces and special characters
-		tokens := Tokenize(chatCompletionFakeResponses[index])
-		remaining := numOfTokens - len(allTokens)
-
-		if len(tokens) > remaining {
-			// there is too many tokens, append only the relevant part
-			tokens = tokens[:remaining]
-		}
-
-		if len(allTokens) > 0 {
-			// for not first sentences add space to the first token to separate between sentences without adding an additional token
-			tokens[0] = " " + tokens[0]
-		}
-
-		allTokens = append(allTokens, tokens...)
-	}
-
-	// return all tokens as text
-	return strings.Join(allTokens, "")
-}
-
-// GetRandomResponseText generates text to be returned in a response, and the finish reason (stop or length)
-// if maxCompletionTokens is defined
-// - currently, the generated number of words in the text will be equal to it value
-// - in future - need to find statistics about generated tokens distribution and return less tokens in part os requests
-// - finish reason will be chosen randomly from the collection (stop, length) with 80% for stop and 20% for length
-// if maxCompletionTokens is nil
-// - the response text's length is randomly chosen from the range [1, responseLenMax] according additional parameters
-// - finish reason is stop
-// if ignore_eos is true - the response will be generated with exactly maxCompletionTokens tokens
-// - request was validated so that when ignore_eos is true, maxCompletionTokens must be defined
-func GetRandomResponseText(maxCompletionTokens *int64, ignore_eos bool) (string, string) {
-	numOfTokens := 0
-	finishReason := StopFinishReason
-
-	// no max completion tokens, return text with random length
-	if maxCompletionTokens == nil {
-		numOfTokens = GetRandomResponseLen()
-	} else {
-		maxTokens := int(*maxCompletionTokens)
-		if ignore_eos {
-			numOfTokens = maxTokens
-			finishReason = LengthFinishReason
-		} else {
-			// max tokens is defined - generate real length of the response based on it
-			numOfTokens = getResponseLengthByHistogram(maxTokens)
-			if numOfTokens == maxTokens {
-				// if response should be create with maximum number of tokens - finish reason will be 'length'
-				finishReason = LengthFinishReason
-			}
-		}
-	}
-
-	text := GetRandomText(numOfTokens)
-	return text, finishReason
-}
-
-// getResponseLengthByHistogram calculates the number of tokens to be returned in a response based on the max tokens value and the pre-defined buckets.
-// The response length is distributed according to the probabilities, defined in respLenBucketsProbabilities.
-// The histogram contains equally sized buckets and the last special bucket, which contains only the maxTokens value.
-// The last element of respLenBucketsProbabilities defines the probability of a reposnse with maxToken tokens.
-// Other values define probabilities for the equally sized buckets.
-// If maxToken is small (smaller than number of buckets) - the response length is randomly selected from the range [1, maxTokens]
-func getResponseLengthByHistogram(maxTokens int) int {
-	if maxTokens <= 1 {
-		return maxTokens
-	}
-	// maxTokens is small - no need to use the histogram of probabilities, just select a random value in the range [1, maxTokens]
-	if maxTokens <= len(cumulativeBucketsProbabilities) {
-		res := RandomInt(1, maxTokens)
-		return res
-	}
-
-	r := RandomFloat(0, 1)
-
-	// check if r is in the last bucket, then maxTokens should be returned
-	if r > cumulativeBucketsProbabilities[len(cumulativeBucketsProbabilities)-2] {
-		return maxTokens
-	}
-
-	// determine which bucket to use, the bucket with a cumulative probability larger than r is the bucket to use
-	// initialize bucketIndex with the last bucket to handle the case (which should not happen) when the probabilities sum is less than 1
-	bucketIndex := len(cumulativeBucketsProbabilities) - 1
-	for i, c := range cumulativeBucketsProbabilities {
-		if r <= c {
-			bucketIndex = i
-			break
-		}
-	}
-
-	// calculate the size of all of the buckets (except the special last bucket)
-	start, end := calcBucketBoundaries(maxTokens, bucketIndex)
-
-	// pick uniformly within the bucket’s range
-	return RandomInt(start, end)
-}
-
-// calcBucketBoundaries calculates boundaries of a bucket with the given index.
-// Maximum size for equally sized buckets is defined by maxFixedBucketSize.
-// [maxFixedBucketSize*(number-of-buckets-1)+1] is the value of maxTokens for which
-// division to equally size buckets will give buckets with size maxFixedBucketSize.
-// If maxTokens is [maxFixedBucketSize*(number-of-buckets-1)+1] or less,
-// all buckets will be of equal size, except the last bucket, which contains only one value.
-// If maxTokens is higher than [maxFixedBucketSize*(number-of-buckets-1)+1],
-// and flexBucketIndex is valid (between 0 and number of buckets - 1) the buckets sizes will not be equal.
-// In this case, all buckets except the one at flexBucketIndex index will have size 20 (and the last is with size 1),
-// and the bucket at flexBucketIndex index will 'stretch' to cover the remaining range.
-func calcBucketBoundaries(maxTokens int, bucketIndex int) (start int, end int) {
-	maxEquallyBucketsSz := maxFixedBucketSize*(len(cumulativeBucketsProbabilities)-1) + 1
-
-	if maxTokens <= maxEquallyBucketsSz || flexBucketIndex < 0 || flexBucketIndex >= len(cumulativeBucketsProbabilities)-1 {
-		// create equally size buckets
-		// calculate the size of all of the buckets (except the special last bucket)
-		bucketSize := float64(maxTokens-1) / float64(len(cumulativeBucketsProbabilities)-1)
-		start = int(bucketSize*float64(bucketIndex)) + 1
-		end = int(bucketSize * float64(bucketIndex+1))
-	} else {
-		// create non-equally sized buckets and find boundaries of the required bucket
-		if bucketIndex < flexBucketIndex {
-			// the relevant bucket is before the flex bucket, all buckets are of the same size (maxFixedBucketSize)
-			// start is the minimum number in the required bucket
-			start = maxFixedBucketSize*bucketIndex + 1
-			end = maxFixedBucketSize * (bucketIndex + 1)
-		} else {
-			flexBucketSize := maxTokens - (maxFixedBucketSize * (len(cumulativeBucketsProbabilities) - 2))
-
-			if bucketIndex == flexBucketIndex {
-				// the relevant bucket is the flex bucket
-				start = int(maxFixedBucketSize*float64(bucketIndex)) + 1
-				end = maxFixedBucketSize*bucketIndex + flexBucketSize
-			} else {
-				// the relevant bucket is one of buckets after the flex bucket
-				start = int(maxFixedBucketSize*float64(bucketIndex-1)) + flexBucketSize + 1
-				end = maxFixedBucketSize*bucketIndex + flexBucketSize
-			}
-		}
-	}
-
-	// sometimes end could be maxTokens because of rounding, change the value to maxToken-1
-	if end >= maxTokens {
-		end = maxTokens - 1
-	}
-
-	return start, end
-}
-
-// GetResponseText returns response text, from a given text
-// considering max completion tokens if it is not nil, and a finish reason (stop or length)
-func GetResponseText(maxCompletionTokens *int64, text string) (string, string) {
-	// no max completion tokens, return entire text
-	if maxCompletionTokens == nil {
-		return text, StopFinishReason
-	}
-
-	// create tokens from text, splitting by spaces
-	tokens := Tokenize(text)
-
-	// return entire text
-	if *maxCompletionTokens >= int64(len(tokens)) {
-		return text, StopFinishReason
-	}
-	// return truncated text
-	return strings.Join(tokens[0:*maxCompletionTokens], " "), LengthFinishReason
-}
-
 func RandomNumericString(length int) string {
 	digits := "0123456789"
 	result := make([]byte, length)
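Per the commit message, the response-generation logic deleted above was moved from `common` into the `dataset` package rather than dropped. As a standalone illustration of the technique it implements, here is a compact sketch of histogram-based response-length sampling: a cumulative distribution over equally sized buckets plus a final bucket that maps to exactly `maxTokens`. It uses the same bucket probabilities as the removed code, but the simplified `sampleLen` below is this sketch's own (it omits the removed code's flex-bucket refinement).

```go
package main

import (
	"fmt"
	"math/rand"
)

// Bucket probabilities from the removed code; the last entry is the
// probability of a response with exactly maxTokens tokens.
var probs = []float64{0.2, 0.3, 0.2, 0.05, 0.1, 0.15}
var cum []float64

func init() {
	// Precompute the cumulative distribution, as the removed init() did.
	sum := 0.0
	for _, p := range probs {
		sum += p
		cum = append(cum, sum)
	}
}

// sampleLen picks a response length in [1, maxTokens]: small maxTokens is
// sampled uniformly; otherwise a random draw selects a bucket, the last
// bucket yields exactly maxTokens, and the rest cover [1, maxTokens-1]
// in equal slices.
func sampleLen(maxTokens int) int {
	if maxTokens <= 1 {
		return maxTokens
	}
	if maxTokens <= len(cum) {
		return 1 + rand.Intn(maxTokens) // too few values for the histogram
	}
	r := rand.Float64()
	if r > cum[len(cum)-2] {
		return maxTokens // landed in the special last bucket
	}
	bucket := len(cum) - 1 // fallback if probabilities sum below 1
	for i, c := range cum {
		if r <= c {
			bucket = i
			break
		}
	}
	size := float64(maxTokens-1) / float64(len(cum)-1)
	start := int(size*float64(bucket)) + 1
	end := int(size * float64(bucket+1))
	if end >= maxTokens {
		end = maxTokens - 1 // guard against rounding up to maxTokens
	}
	if end < start {
		end = start
	}
	return start + rand.Intn(end-start+1)
}

func main() {
	for _, m := range []int{1, 4, 100} {
		fmt.Println(sampleLen(m))
	}
}
```

The design point this preserves: capping a response at `maxTokens` with a fixed probability (here 0.15) lets the simulator emit the `length` finish reason at a controlled rate, while the other buckets spread shorter responses across the allowed range.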
