Skip to content

Commit f8709bc

Browse files
committed
- reverted API changes: this package is standalone
Signed-off-by: Maroon Ayoub <[email protected]>
1 parent 5db1e8e commit f8709bc

File tree

13 files changed

+158
-319
lines changed

13 files changed

+158
-319
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ help: ## Print help
2323
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
2424

2525
##@ Tokenizer & Linking
26+
2627
LDFLAGS ?= -extldflags '-L$(shell pwd)/lib'
2728
CGO_ENABLED=1
2829
TOKENIZER_LIB = lib/libtokenizers.a
@@ -82,7 +83,6 @@ e2e-test: download-tokenizer
8283
@printf "\033[33;1m==== Running unit tests ====\033[0m\n"
8384
go test -v -ldflags="$(LDFLAGS)" ./tests/...
8485

85-
8686
##@ Build
8787

8888
.PHONY: build

examples/kv_cache_index/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ func runPrompts(ctx context.Context, kvCacheIndexer *kvcache.Indexer) error {
115115
logger.Info("Started Indexer", "model", modelName)
116116

117117
// Get pods for the prompt
118-
pods, err := kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, modelName, nil, false)
118+
pods, err := kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, modelName, nil)
119119
if err != nil {
120120
return err
121121
}
@@ -136,7 +136,7 @@ func runPrompts(ctx context.Context, kvCacheIndexer *kvcache.Indexer) error {
136136
time.Sleep(3 * time.Second)
137137

138138
// Get pods for the prompt
139-
pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, modelName, nil, false)
139+
pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, modelName, nil)
140140
if err != nil {
141141
return err
142142
}

examples/kv_events/offline/main.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ func runEventsDemo(ctx context.Context, kvCacheIndexer *kvcache.Indexer, publish
152152
logger.Info("@@@ Starting KV Events Demo", "model", testdata.ModelName)
153153

154154
// Initial query - should be empty since no events have been published
155-
pods, err := kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil, false)
155+
pods, err := kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil)
156156
if err != nil {
157157
return err
158158
}
@@ -185,7 +185,7 @@ func runEventsDemo(ctx context.Context, kvCacheIndexer *kvcache.Indexer, publish
185185
time.Sleep(3 * time.Second)
186186

187187
// Query again to see the effect of the events
188-
pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil, false)
188+
pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil)
189189
if err != nil {
190190
return err
191191
}
@@ -214,7 +214,7 @@ func runEventsDemo(ctx context.Context, kvCacheIndexer *kvcache.Indexer, publish
214214
time.Sleep(3 * time.Second)
215215

216216
// Final query
217-
pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil, false)
217+
pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil)
218218
if err != nil {
219219
return err
220220
}

examples/kv_events/online/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func main() {
147147
return
148148
}
149149

150-
pods, err := kvCacheIndexer.GetPodScores(ctx, req.Prompt, modelName, nil, false)
150+
pods, err := kvCacheIndexer.GetPodScores(ctx, req.Prompt, modelName, nil)
151151
if err != nil {
152152
http.Error(w, fmt.Sprintf("error: %v", err), http.StatusInternalServerError)
153153
return

pkg/kvcache/indexer.go

Lines changed: 1 addition & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,13 @@ package kvcache
1818

1919
import (
2020
"context"
21-
"encoding/json"
2221
"fmt"
2322

2423
"k8s.io/apimachinery/pkg/util/sets"
2524
"k8s.io/klog/v2"
2625

2726
"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvblock"
2827
"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
29-
chattemplatego "github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization/chat_template_go"
3028
"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization/prefixstore"
3129
"github.com/llm-d/llm-d-kv-cache-manager/pkg/utils/logging"
3230
)
@@ -117,50 +115,14 @@ func (k *Indexer) KVBlockIndex() kvblock.Index {
117115
//
118116
// The function returns a map of pod identifiers to scores.
119117
func (k *Indexer) GetPodScores(ctx context.Context, prompt, modelName string,
120-
podIdentifiers []string, chatCompletion bool,
118+
podIdentifiers []string,
121119
) (map[string]int, error) {
122120
traceLogger := klog.FromContext(ctx).V(logging.TRACE).WithName("kvcache.GetPodScores")
123-
124-
// Handle chat completion requests
125-
if chatCompletion {
126-
// Parse the prompt as a ChatTemplateRequest JSON
127-
var req chattemplatego.ChatTemplateRequest
128-
if err := json.Unmarshal([]byte(prompt), &req); err != nil {
129-
return nil, fmt.Errorf("failed to parse chat template request: %w", err)
130-
}
131-
132-
// Create or reuse the CGo wrapper (could be a singleton in production)
133-
// TODO: cache, instance management
134-
wrapper := chattemplatego.NewChatTemplateCGoWrapper()
135-
136-
// Fetch the chat template for the model (if not already set)
137-
if req.ChatTemplate == "" {
138-
getReq := chattemplatego.GetChatTemplateRequest{ModelName: modelName}
139-
template, template_vars, err := wrapper.GetModelChatTemplate(getReq)
140-
if err != nil {
141-
return nil, fmt.Errorf("failed to fetch chat template: %w", err)
142-
}
143-
req.ChatTemplate = template
144-
req.TemplateVars = template_vars
145-
}
146-
147-
// Apply the template to the request
148-
resp, err := wrapper.RenderChatTemplate(req)
149-
if err != nil {
150-
return nil, fmt.Errorf("failed to render chat template: %w", err)
151-
}
152-
if len(resp.RenderedChats) == 0 {
153-
return nil, nil
154-
}
155-
prompt = resp.RenderedChats[0]
156-
}
157-
158121
// 0. add to tokenizers pool
159122
k.tokenizersPool.AddTask(prompt, modelName)
160123

161124
// 1. get available tokens of longest prefix
162125
tokens := k.tokensIndexer.FindLongestContainedTokens(prompt, modelName)
163-
164126
if len(tokens) == 0 {
165127
//nolint:nilnil // no need to return an error
166128
return nil, nil
@@ -188,14 +150,6 @@ func (k *Indexer) GetPodScores(ctx context.Context, prompt, modelName string,
188150
return podScores, nil
189151
}
190152

191-
// GetPodScoresDefault is a convenience function for backward compatibility
192-
// that calls GetPodScores with chatCompletion=false
193-
func (k *Indexer) GetPodScoresDefault(ctx context.Context, prompt, modelName string,
194-
podIdentifiers []string,
195-
) (map[string]int, error) {
196-
return k.GetPodScores(ctx, prompt, modelName, podIdentifiers, false)
197-
}
198-
199153
// podsPerKeyPrintHelper formats a map of keys to pod names for printing.
200154
func podsPerKeyPrintHelper(ks map[kvblock.Key][]string) string {
201155
flattened := ""
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Chat Template Integration for OpenAI-API v1/chat_completions Compatibility
2+
3+
## Why Templating is Needed
4+
5+
When processing OpenAI ChatCompletions requests, vLLM templates the input before tokenization.
6+
For KV-cache lookups to work correctly, we must replicate this templating process in our indexer.
7+
8+
**Example:**
9+
```json
10+
// Input: ChatCompletions request
11+
{
12+
"messages": [
13+
{"role": "user", "content": "What's 2+2?"},
14+
{"role": "assistant", "content": "Let me calculate that."},
15+
{"role": "user", "content": "Thanks!"}
16+
]
17+
}
18+
```
19+
20+
```jinja2
21+
{# Model template (e.g., Llama-2) #}
22+
{% for message in messages %}
23+
{% if message['role'] == 'user' %}
24+
{{ '<s>[INST] ' + message['content'] + ' [/INST]' }}
25+
{% elif message['role'] == 'assistant' %}
26+
{{ message['content'] + '</s>' }}
27+
{% endif %}
28+
{% endfor %}
29+
```
30+
31+
```text
32+
<!-- Flattened prompt the model actually sees -->
33+
<s>[INST] What's 2+2? [/INST]Let me calculate that.</s><s>[INST] Thanks! [/INST]
34+
```
35+
36+
**Without templating**, we cannot reproduce the exact tokens that vLLM produces, which leads to incorrect KV-cache lookups.
37+
38+
## Integration with Existing Pipeline
39+
40+
This package provides a library for rendering chat templates; callers use it to flatten a chat request into a prompt before passing that prompt to the `kvcache.Indexer` entry point.
41+
42+
### Requirements
43+
44+
The router can receive a standard OpenAI ChatCompletions request and convert it to a JSON string representing our `ChatTemplateRequest`.
45+
46+
**ChatTemplateRequest accepts these fields:**
47+
- `Conversations` - List of message lists (role/content pairs)
48+
- `Tools` - (Optional) List of tool schemas
49+
- `Documents` - (Optional) List of document dicts
50+
- `ChatTemplate` - (Optional) Override for the chat template
51+
- `ReturnAssistantTokensMask` - (Optional) Whether to return assistant token indices
52+
- `ContinueFinalMessage` - (Optional) Whether to continue from the final message
53+
- `AddGenerationPrompt` - (Optional) Whether to add a generation prompt
54+
- `TemplateVars` - (Optional) Special tokens for template rendering
55+
56+
### Template Processing Flow
57+
58+
The templating process (steps 1.1-1.4) handles the conversion from structured request to flattened prompt:
59+
60+
```
61+
1.1. **CGO Binding**: chattemplatego.NewChatTemplateCGoWrapper()
62+
└── cgo_functions.go:NewChatTemplateCGoWrapper()
63+
└── Creates ChatTemplateCGoWrapper struct with initialized=false
64+
65+
1.2. **Template Fetching**: wrapper.GetModelChatTemplate(getReq)
66+
├── cgo_functions.go:GetModelChatTemplate(req)
67+
│ ├── Initialize() Python interpreter via CGO
68+
│ ├── executePythonCode() - **CGO Binding** to Python
69+
│ └── **Python Wrapper**: chat_template_wrapper.py:get_model_chat_template()
70+
│ └── Uses Hugging Face AutoTokenizer to fetch model template
71+
└── Returns: (template, template_vars)
72+
73+
1.3. **Template Rendering**: wrapper.RenderChatTemplate(req)
74+
├── cgo_functions.go:RenderChatTemplate(req)
75+
│ ├── Initialize() Python interpreter via CGO (if not already done)
76+
│ ├── executePythonCode() - **CGO Binding** to Python
77+
│ └── **Python Wrapper**: chat_template_wrapper.py:render_jinja_template()
78+
│ └── Imports render_jinja_template from transformers.utils.chat_template_utils
79+
│ └── Uses transformers library's core template rendering functionality
80+
└── Returns: ChatTemplateResponse
81+
82+
1.4. **Extract Flattened Prompt**
83+
└── prompt := resp.RenderedChats[0]
84+
└── Continue with existing pipeline: Tokenize → KV Block Keys → Pod Scoring
85+
```

pkg/tokenization/chat_template_go/cgo_functions.go renamed to pkg/preprocessing/chat_completions_template/cgo_functions.go

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,28 @@
1-
package chattemplatego
1+
//go:build exclude
2+
3+
/*
4+
Copyright 2025 The llm-d Authors.
5+
6+
Licensed under the Apache License, Version 2.0 (the "License");
7+
you may not use this file except in compliance with the License.
8+
You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
*/
18+
19+
package chat_completions_template
220

321
/*
422
// CGo build flags for Python 3.11
5-
// These are platform-specific and may need adjustment for different systems
6-
#cgo CFLAGS: -I/Library/Frameworks/Python.framework/Versions/3.11/include/python3.11
7-
#cgo LDFLAGS: -L/Library/Frameworks/Python.framework/Versions/3.11/lib -lpython3.11
23+
// TODO: proper setup
24+
// #cgo CFLAGS: -I/Library/Frameworks/Python.framework/Versions/3.11/include/python3.11
25+
// #cgo LDFLAGS: -L/Library/Frameworks/Python.framework/Versions/3.11/lib -lpython3.11
826
#include "cgo_functions.h"
927
*/
1028
import "C"

pkg/tokenization/chat_template_go/cgo_functions.h renamed to pkg/preprocessing/chat_completions_template/cgo_functions.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
/*
2+
Copyright 2025 The llm-d Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
117
#ifndef CGO_FUNCTIONS_H
218
#define CGO_FUNCTIONS_H
319

pkg/tokenization/chat_template_go/chat_template_wrapper.py renamed to pkg/preprocessing/chat_completions_template/chat_template_wrapper.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
# Copyright 2025 The llm-d Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
#!/usr/bin/env python3
216
"""
317
Standalone wrapper for render_jinja_template function from transformers.

0 commit comments

Comments
 (0)