# HOWTO: Using `GetCompletionsPodScores` for OpenAI-API ChatCompletions Requests with kv-cache-manager

## Overview

`GetCompletionsPodScores` in `indexer.go` enables the kv-cache-manager to support OpenAI-compatible ChatCompletions requests by rendering the full message structure (including tools and documents) into a prompt with a Python Jinja2 template, before tokenization and KV block key calculation.

---

## What struct do I need to receive from the router?

You must provide a `chattemplatego.ChatTemplateRequest` with the following fields:

```go
// ChatTemplateRequest represents the request to render a chat template
type ChatTemplateRequest struct {
	Conversations             [][]ChatMessage        `json:"conversations"`
	Tools                     []interface{}          `json:"tools,omitempty"`
	Documents                 []interface{}          `json:"documents,omitempty"`
	ChatTemplate              string                 `json:"chat_template,omitempty"`
	ReturnAssistantTokensMask bool                   `json:"return_assistant_tokens_mask,omitempty"`
	ContinueFinalMessage      bool                   `json:"continue_final_message,omitempty"`
	AddGenerationPrompt       bool                   `json:"add_generation_prompt,omitempty"`
	TemplateVars              map[string]interface{} `json:"template_vars,omitempty"`
}
```

- **Conversations**: List of message lists (role/content pairs)
- **Tools**: (Optional) List of tool schemas
- **Documents**: (Optional) List of document dicts
- **ChatTemplate**: (Optional) Override for the chat template
- **ReturnAssistantTokensMask**: (Optional) Whether to return assistant token indices
- **ContinueFinalMessage**: (Optional) Whether to continue from the final message
- **AddGenerationPrompt**: (Optional) Whether to add a generation prompt
- **TemplateVars**: (Optional) Special tokens for template rendering

This struct mirrors the OpenAI ChatCompletions request, supporting messages, tools, documents, and advanced template options.

### ChatMessage Struct

The `ChatMessage` struct represents individual messages within conversations:

```go
// ChatMessage represents a single message in a conversation
type ChatMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}
```

- **Role**: The role of the message sender (e.g., "user", "assistant", "system")
- **Content**: The actual message content/text

**Example usage:**
```go
conversation := []chattemplatego.ChatMessage{
	{Role: "user", Content: "What is the weather in Paris?"},
	{Role: "assistant", Content: "Let me check that for you."},
	{Role: "user", Content: "Thank you!"},
}
```

This structure follows the OpenAI ChatCompletions API format, making it compatible with existing chat-based applications.

---

## How do the three scoring functions differ?

- **`GetPromptPodScores`**:
  Accepts a simple prompt string, tokenizes it, and calculates KV block keys directly.

- **`GetCompletionsPodScores`**:
  Accepts a full `ChatTemplateRequest` (with messages, tools, etc.), uses the Python Jinja2 template (via CGO) to flatten the structure into a prompt, then tokenizes and calculates KV block keys. This ensures the prompt matches what the model actually sees.

- **`GetPodScores`**:
  A unified interface that automatically dispatches to either `GetPromptPodScores` or `GetCompletionsPodScores` based on the input type:
  - If the input is a `string` → calls `GetPromptPodScores`
  - If the input is a `ChatTemplateRequest` → calls `GetCompletionsPodScores`
  - This provides a single entry point for both simple prompts and complex chat completions.
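
The dispatch described above can be sketched with a Go type switch. This is an illustration of the pattern, not the real implementation — the struct is trimmed and the returned strings merely name the path taken:

```go
package main

import "fmt"

// Minimal stand-in for the real request type; fields trimmed for brevity.
type ChatMessage struct {
	Role, Content string
}

type ChatTemplateRequest struct {
	Conversations [][]ChatMessage
}

// dispatch routes on the dynamic type of its input, mirroring how a unified
// entry point like GetPodScores can serve both prompt and chat requests.
func dispatch(input interface{}) string {
	switch v := input.(type) {
	case string:
		return fmt.Sprintf("GetPromptPodScores(%q)", v)
	case ChatTemplateRequest:
		return fmt.Sprintf("GetCompletionsPodScores(%d conversation(s))", len(v.Conversations))
	default:
		return "error: unsupported input type"
	}
}

func main() {
	fmt.Println(dispatch("Hello"))
	fmt.Println(dispatch(ChatTemplateRequest{
		Conversations: [][]ChatMessage{{{Role: "user", Content: "Hi"}}},
	}))
}
```

A `default` arm is worth keeping in any real dispatcher so unsupported payload types fail loudly instead of silently scoring nothing.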

---

## Detailed Flow: `GetCompletionsPodScores` Pipeline

When `indexer.go:GetCompletionsPodScores()` is called, here's the complete flow through files and functions:

```
1. indexer.go:GetCompletionsPodScores(ctx, req, modelName, podIdentifiers)
   │
   ├── 1.1. **CGO Binding**: chattemplatego.NewChatTemplateCGoWrapper()
   │    └── cgo_functions.go:NewChatTemplateCGoWrapper()
   │         └── Creates ChatTemplateCGoWrapper struct with initialized=false
   │
   ├── 1.2. **CGO Binding**: wrapper.GetModelChatTemplate(getReq)
   │    ├── cgo_functions.go:GetModelChatTemplate(req)
   │    │    ├── Initialize() Python interpreter via CGO
   │    │    ├── executePythonCode() - **CGO Binding** to Python
   │    │    └── **Python Wrapper**: chat_template_wrapper.py:get_model_chat_template()
   │    │         └── Uses Hugging Face AutoTokenizer to fetch the model template
   │    └── Returns: (template, template_vars)
   │
   ├── 1.3. **CGO Binding**: wrapper.RenderChatTemplate(req)
   │    ├── cgo_functions.go:RenderChatTemplate(req)
   │    │    ├── Initialize() Python interpreter via CGO (if not already done)
   │    │    ├── executePythonCode() - **CGO Binding** to Python
   │    │    └── **Python Wrapper**: chat_template_wrapper.py:render_jinja_template()
   │    │         ├── _compile_jinja_template() - Compiles the Jinja2 template
   │    │         ├── AssistantTracker class - Tracks assistant token indices
   │    │         └── Returns: (rendered_chats, generation_indices)
   │    └── Returns: ChatTemplateResponse
   │
   ├── 1.4. Extract prompt from response
   │    └── prompt := resp.RenderedChats[0]
   │
   ├── 1.5. **Tokenization**: k.tokenizersPool.AddTask(prompt, modelName)
   │    └── tokenization/pool.go:AddTask() - Queues tokenization task
   │
   ├── 1.6. **Prefix Store**: k.tokensIndexer.FindLongestContainedTokens(prompt, modelName)
   │    └── prefixstore/lru-store.go:FindLongestContainedTokens() - Finds cached tokens
   │
   ├── 1.7. **Token Processing**: k.tokensProcessor.TokensToKVBlockKeys(tokens, modelName)
   │    └── kv-cache/token-processor.go:TokensToKVBlockKeys() - Converts tokens to block keys
   │
   ├── 1.8. **KV Block Indexing**: k.kvBlockIndexer.GetPodsForKeys(ctx, blockKeys, podSet)
   │    └── kv-cache/kvblock-indexer.go:GetPodsForKeys() - Queries Redis for pod mappings
   │
   └── 1.9. **Scoring**: k.kvBlockScorer.Score(strBlockKeys, keyToPods)
        └── kv-cache/kvblock-scorer.go:Score() - Calculates pod scores
```

### Key Components in the Pipeline

**🔗 CGO Bindings** (Go → Python):
- `cgo_functions.go` - Provides the bridge between Go and Python
- Uses Python's C API via CGO to call Python functions directly
- Manages the Python interpreter lifecycle (Initialize/Finalize)

**📦 Python Wrapper** (Python → Hugging Face):
- `chat_template_wrapper.py` - Wraps Hugging Face's complex template system
- Provides a clean API for template rendering and model template fetching
- Handles Jinja2 compilation, assistant tracking, and error handling

**🔄 Data Flow**:
1. **Input**: `ChatTemplateRequest` (messages, tools, documents)
2. **Template Fetching**: Model-specific chat template from Hugging Face
3. **Template Rendering**: Jinja2 template processing with tools/documents
4. **Tokenization**: Convert the rendered prompt to tokens
5. **KV Cache Lookup**: Find cached token blocks and associated pods
6. **Scoring**: Calculate pod scores based on cache hits

This pipeline ensures that chat completion requests are properly templated, tokenized, and scored against the KV cache, providing accurate pod recommendations for efficient request routing.
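
The data flow above can be sketched end to end with stub stages. Everything here is a toy stand-in: template rendering, tokenization, and the Redis lookup are replaced by in-memory versions, and the block size and function names are illustrative only — the real stages live in `cgo_functions.go`, `tokenization/`, `prefixstore/`, and `kv-cache/`.

```go
package main

import (
	"fmt"
	"strings"
)

// renderTemplate stands in for the Jinja2 rendering step (1.2-1.4).
func renderTemplate(messages []string) string { return strings.Join(messages, "\n") }

// tokenize stands in for the tokenizer pool (1.5); real code produces token IDs.
func tokenize(prompt string) []string { return strings.Fields(prompt) }

// tokensToBlockKeys groups tokens into fixed-size blocks and derives one key
// per block (1.7); real code hashes token IDs, and the block size is illustrative.
func tokensToBlockKeys(tokens []string) []string {
	const blockSize = 2
	var keys []string
	for i := 0; i < len(tokens); i += blockSize {
		end := i + blockSize
		if end > len(tokens) {
			end = len(tokens)
		}
		keys = append(keys, strings.Join(tokens[i:end], "|"))
	}
	return keys
}

// scorePods counts cache hits per pod (1.8-1.9): one point for every block
// key a pod already holds.
func scorePods(keys []string, keyToPods map[string][]string) map[string]int {
	scores := map[string]int{}
	for _, k := range keys {
		for _, pod := range keyToPods[k] {
			scores[pod]++
		}
	}
	return scores
}

func main() {
	prompt := renderTemplate([]string{"user: What is the weather in Paris?"})
	keys := tokensToBlockKeys(tokenize(prompt))
	// Pretend pod-a has the first block cached; real data comes from Redis.
	keyToPods := map[string][]string{keys[0]: {"pod-a"}}
	fmt.Println(scorePods(keys, keyToPods))
}
```

The key property the sketch preserves is that scoring operates on block keys derived from the *rendered* prompt — which is why skipping the template step would produce keys that never match the model's actual KV cache.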

---

## Summary

- The router should send a `ChatTemplateRequest` (not just a prompt string) to the indexer.
- `GetCompletionsPodScores` handles template rendering and tokenization internally, ensuring correct KV block key calculation for all supported models.
- The integration uses a CGO bridge (`cgo_functions.go`) to call Python (`chat_template_wrapper.py`) for template rendering, matching vLLM and OpenAI API behavior.