Generate tokens instead of strings

pancak3 · pancak3 · commit c09b6087fb6d · 2025-09-16T20:46:48.000+10:00
Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
diff --git a/pkg/common/utils.go b/pkg/common/utils.go
@@ -17,7 +17,6 @@ limitations under the License.
 package common
 
 import (
-	"fmt"
 	"math"
 	"math/rand"
 	"regexp"
@@ -73,26 +72,6 @@ func init() {
 	}
 }
 
-// returns the max tokens or error if incorrect
-func GetMaxTokens(maxCompletionTokens *int64, maxTokens *int64) (*int64, error) {
-	var typeToken string
-	var tokens *int64
-	// if both arguments are passed,
-	// use maxCompletionTokens
-	// as in the real vllm
-	if maxCompletionTokens != nil {
-		tokens = maxCompletionTokens
-		typeToken = "max_completion_tokens"
-	} else if maxTokens != nil {
-		tokens = maxTokens
-		typeToken = "max_tokens"
-	}
-	if tokens != nil && *tokens < 1 {
-		return nil, fmt.Errorf("%s must be at least 1, got %d", typeToken, *tokens)
-	}
-	return tokens, nil
-}
-
 // ValidateContextWindow checks if the request fits within the model's context window
 // Returns validation result, actual completion tokens, and total tokens
 func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {
@@ -157,7 +136,7 @@ func GetRandomText(numOfTokens int) string {
 	return strings.Join(allTokens, "")
 }
 
-// GetRandomResponseText generates text to be returned in a response, and the finish reason (stop or length)
+// GetRandomTokens generates tokens to be returned in a response, and the finish reason (stop or length)
 // if maxCompletionTokens is defined
 // - currently, the generated number of words in the text will be equal to its value
 // - in future - need to find statistics about generated tokens distribution and return less tokens in part of the requests
@@ -167,7 +146,7 @@ func GetRandomText(numOfTokens int) string {
 // - finish reason is stop
 // if ignore_eos is true - the response will be generated with exactly maxCompletionTokens tokens
 // - request was validated so that when ignore_eos is true, maxCompletionTokens must be defined
-func GetRandomResponseText(maxCompletionTokens *int64, ignore_eos bool) (string, string) {
+func GetRandomTokens(maxCompletionTokens *int64, ignore_eos bool) ([]string, string) {
 	numOfTokens := 0
 	finishReason := StopFinishReason
 
@@ -189,8 +168,7 @@ func GetRandomResponseText(maxCompletionTokens *int64, ignore_eos bool) (string,
 		}
 	}
 
-	text := GetRandomText(numOfTokens)
-	return text, finishReason
+	return Tokenize(GetRandomText(numOfTokens)), finishReason
 }
 
 // getResponseLengthByHistogram calculates the number of tokens to be returned in a response based on the max tokens value and the pre-defined buckets.
@@ -282,23 +260,20 @@ func calcBucketBoundaries(maxTokens int, bucketIndex int) (start int, end int) {
 	return start, end
 }
 
-// GetResponseText returns response text, from a given text
+// GetResponseTokens returns needed tokens, from a given text
 // considering max completion tokens if it is not nil, and a finish reason (stop or length)
-func GetResponseText(maxCompletionTokens *int64, text string) (string, string) {
+func GetResponseTokens(maxCompletionTokens *int64, text string) ([]string, string) {
+	tokens := Tokenize(text)
 	// no max completion tokens, return entire text
 	if maxCompletionTokens == nil {
-		return text, StopFinishReason
+		return tokens, StopFinishReason
 	}
 
-	// create tokens from text, splitting by spaces
-	tokens := Tokenize(text)
-
-	// return entire text
 	if *maxCompletionTokens >= int64(len(tokens)) {
-		return text, StopFinishReason
+		return tokens, StopFinishReason
 	}
 	// return truncated text
-	return strings.Join(tokens[0:*maxCompletionTokens], " "), LengthFinishReason
+	return tokens[0:*maxCompletionTokens], LengthFinishReason
 }
 
 func RandomNumericString(length int) string {
diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go
@@ -18,6 +18,7 @@ package common
 
 import (
 	"fmt"
+	"strings"
 	"time"
 
 	. "github.com/onsi/ginkgo/v2"
@@ -29,16 +30,17 @@ var _ = Describe("Utils", Ordered, func() {
 		InitRandom(time.Now().UnixNano())
 	})
 
-	Context("GetRandomResponseText", func() {
+	Context("GetRandomTokens", func() {
 		It("should return complete text", func() {
-			text, finishReason := GetRandomResponseText(nil, false)
+			tokens, finishReason := GetRandomTokens(nil, false)
+			text := strings.Join(tokens, "")
 			Expect(IsValidText(text)).To(BeTrue())
 			Expect(finishReason).Should(Equal(StopFinishReason))
 		})
 		It("should return short text", func() {
 			maxCompletionTokens := int64(2)
-			text, finishReason := GetRandomResponseText(&maxCompletionTokens, false)
-			tokensCnt := int64(len(Tokenize(text)))
+			tokens, finishReason := GetRandomTokens(&maxCompletionTokens, false)
+			tokensCnt := int64(len(tokens))
 			Expect(tokensCnt).Should(BeNumerically("<=", maxCompletionTokens))
 			if tokensCnt == maxCompletionTokens {
 				Expect(finishReason).To(Equal(LengthFinishReason))
@@ -50,9 +52,10 @@ var _ = Describe("Utils", Ordered, func() {
 		It("should return long text", func() {
 			// return required number of tokens although it is higher than ResponseLenMax
 			maxCompletionTokens := int64(ResponseLenMax * 5)
-			text, finishReason := GetRandomResponseText(&maxCompletionTokens, false)
-			tokensCnt := int64(len(Tokenize(text)))
+			tokens, finishReason := GetRandomTokens(&maxCompletionTokens, false)
+			tokensCnt := int64(len(tokens))
 			Expect(tokensCnt).Should(BeNumerically("<=", maxCompletionTokens))
+			text := strings.Join(tokens, "")
 			Expect(IsValidText(text)).To(BeTrue())
 			if tokensCnt == maxCompletionTokens {
 				Expect(finishReason).To(Equal(LengthFinishReason))
@@ -65,8 +68,8 @@ var _ = Describe("Utils", Ordered, func() {
 		DescribeTable("should return exact num of tokens",
 			func(maxCompletionTokens int) {
 				n := int64(maxCompletionTokens)
-				text, finishReason := GetRandomResponseText(&n, true)
-				nGenTokens := int64(len(Tokenize(text)))
+				tokens, finishReason := GetRandomTokens(&n, true)
+				nGenTokens := int64(len(tokens))
 				Expect(nGenTokens).Should(Equal(n))
 				Expect(finishReason).To(Equal(LengthFinishReason))
 			},
@@ -80,24 +83,25 @@ var _ = Describe("Utils", Ordered, func() {
 		)
 	})
 
-	Context("GetResponseText", func() {
+	Context("GetResponseTokens", func() {
 		theText := "Give a man a fish and you feed him for a day; teach a man to fish and you feed him for a lifetime"
+		theTokens := Tokenize(theText)
 
 		It("should return the same text since max tokens is not defined", func() {
-			text, finishReason := GetResponseText(nil, theText)
-			Expect(text).Should(Equal(theText))
+			tokens, finishReason := GetResponseTokens(nil, theText)
+			Expect(tokens).Should(Equal(theTokens))
 			Expect(finishReason).Should(Equal(StopFinishReason))
 		})
 		It("should return the same text since max tokens is higher than the text length", func() {
 			maxCompletionTokens := int64(1000)
-			text, finishReason := GetResponseText(&maxCompletionTokens, theText)
-			Expect(text).Should(Equal(theText))
+			tokens, finishReason := GetResponseTokens(&maxCompletionTokens, theText)
+			Expect(tokens).Should(Equal(theTokens))
 			Expect(finishReason).Should(Equal(StopFinishReason))
 		})
 		It("should return partial text", func() {
 			maxCompletionTokens := int64(2)
-			text, finishReason := GetResponseText(&maxCompletionTokens, theText)
-			Expect(int64(len(Tokenize(text)))).Should(Equal(maxCompletionTokens))
+			tokens, finishReason := GetResponseTokens(&maxCompletionTokens, theText)
+			Expect(int64(len(tokens))).Should(Equal(maxCompletionTokens))
 			Expect(finishReason).Should(Equal(LengthFinishReason))
 		})
 	})
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
@@ -599,9 +599,8 @@ func (s *VllmSimulator) GetInterTokenLatency() int {
 }
 
 // generateTokens creates and returns response payload based on this request,
-// i.e., an array of generated tokens, the finish reason, and the number of created tokens
+// i.e., an array of generated tokens, the finish reason, and the number of generated tokens
 func (s *VllmSimulator) generateTokens(req openaiserverapi.CompletionRequest) ([]string, string, int, error) {
-	// if req is ChatCompletionRequest
 	ignoreEOS := req.GetIgnoreEOS()
 	var maxTokens *int64
 	var prompt string
@@ -616,19 +615,13 @@ func (s *VllmSimulator) generateTokens(req openaiserverapi.CompletionRequest) ([
 		return nil, "", 0, fmt.Errorf("unknown request type: %T", req)
 	}
 
-	maxTokensValue, err := common.GetMaxTokens(nil, maxTokens)
-	if err != nil {
-		return nil, "", 0, err
-	}
-
-	var text, finishReason string
+	var finishReason string
+	var tokens []string
 	if s.config.Mode == common.ModeEcho {
-		text, finishReason = common.GetResponseText(maxTokensValue, prompt)
-	} else {
-		text, finishReason = common.GetRandomResponseText(maxTokensValue, ignoreEOS)
+		tokens, finishReason = common.GetResponseTokens(maxTokens, prompt)
+		return tokens, finishReason, len(tokens), nil
 	}
-
-	tokens := common.Tokenize(text)
+	tokens, finishReason = common.GetRandomTokens(maxTokens, ignoreEOS)
 	return tokens, finishReason, len(tokens), nil
 }
 >>>>>>> 48ec8bc (Move token generation to simulator)