Commit c5bfb72

authored by Maya Barnea

update PD support to be compatible with nixlv2 (#258)

* update PD support to be compatible with nixlv2

  Signed-off-by: Maya Barnea <[email protected]>

* fix test + add comment

  Signed-off-by: Maya Barnea <[email protected]>

---------

Signed-off-by: Maya Barnea <[email protected]>
1 parent 719a895 commit c5bfb72

File tree

4 files changed (+36 -34 lines changed):

  pkg/llm-d-inference-sim/simulator.go
  pkg/llm-d-inference-sim/test_utils.go
  pkg/openai-server-api/request.go
  pkg/openai-server-api/response.go


pkg/llm-d-inference-sim/simulator.go

Lines changed: 8 additions & 6 deletions

@@ -576,14 +576,16 @@ func (s *VllmSimulator) createCompletionResponse(logprobs *int, isChatCompletion
 		time.Now().Unix(), modelName, usageData)
 
 	if doRemoteDecode {
+		baseResp.KVParams = &openaiserverapi.KVTransferParams{}
 		// add special fields related to the prefill pod special behavior
-		baseResp.DoRemoteDecode = true
-		baseResp.DoRemotePrefill = false
+		baseResp.KVParams.DoRemoteDecode = false
+		baseResp.KVParams.DoRemotePrefill = true
 		// currently remote prefill information is hard-coded
-		baseResp.RemoteBlockIds = []string{"DUMMY_ID"}
-		baseResp.RemoteEngineId = "DUMMY_ID"
-		baseResp.RemoteHost = "DUMMY"
-		baseResp.RemotePort = 1234
+		baseResp.KVParams.RemoteBlockIds = []string{"DUMMY_ID"}
+		baseResp.KVParams.RemoteEngineId = "DUMMY_ID"
+		baseResp.KVParams.RemoteHost = "DUMMY"
+		baseResp.KVParams.RemotePort = 1234
+		baseResp.KVParams.TPSize = 1
 	}
 
 	baseChoice := openaiserverapi.CreateBaseResponseChoice(0, finishReason)
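For reference, a minimal standalone sketch (not part of the commit; the KVTransferParams definition is copied from the request.go diff below, and the main/marshaling scaffolding is assumed) showing how the hard-coded values above serialize once grouped under kv_transfer_params:

package main

import (
	"encoding/json"
	"fmt"
)

// Copied from the pkg/openai-server-api/request.go diff in this commit
// so the sketch compiles on its own.
type KVTransferParams struct {
	DoRemoteDecode  bool     `json:"do_remote_decode"`
	DoRemotePrefill bool     `json:"do_remote_prefill"`
	RemoteEngineId  string   `json:"remote_engine_id"`
	RemoteBlockIds  []string `json:"remote_block_ids"`
	RemoteHost      string   `json:"remote_host"`
	RemotePort      int      `json:"remote_port"`
	TPSize          int      `json:"tp_size" default:"1"`
}

func main() {
	// Mirror the assignments made in createCompletionResponse above.
	kv := &KVTransferParams{
		DoRemotePrefill: true,
		RemoteBlockIds:  []string{"DUMMY_ID"},
		RemoteEngineId:  "DUMMY_ID",
		RemoteHost:      "DUMMY",
		RemotePort:      1234,
		TPSize:          1,
	}
	out, _ := json.Marshal(kv)
	fmt.Println(string(out))
	// {"do_remote_decode":false,"do_remote_prefill":true,"remote_engine_id":"DUMMY_ID",
	//  "remote_block_ids":["DUMMY_ID"],"remote_host":"DUMMY","remote_port":1234,"tp_size":1}
}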

pkg/llm-d-inference-sim/test_utils.go

Lines changed: 11 additions & 8 deletions

@@ -19,6 +19,7 @@ import (
 	"bufio"
 	"context"
 	"crypto/tls"
+	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
@@ -33,6 +34,7 @@ import (
 	"time"
 
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
+	openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
 	"github.com/openai/openai-go/v3"
 	"github.com/openai/openai-go/v3/option"
 	"github.com/openai/openai-go/v3/packages/param"
@@ -169,14 +171,15 @@ func singleRequestLatencyTest(ttft int, prefillTimePerToken int, interTokenLaten
 func sendCompletionRequestForLatencyTest(client *http.Client, modelName string, prompt string, isStreaming bool, doRemotePrefill bool) {
 	// send completions request using http post because disagregated PD fields should be included
 	// Test with raw HTTP to verify the error response format
-	reqBody := fmt.Sprintf(`{
-		"prompt": "%s",
-		"model": "%s",
-		"stream": %t,
-		"do_remote_prefill": %t
-	}`, prompt, modelName, isStreaming, doRemotePrefill)
-
-	resp, err := client.Post("http://localhost/v1/completions", "application/json", strings.NewReader(reqBody))
+	req := &openaiserverapi.TextCompletionRequest{Prompt: prompt}
+	req.KVParams = &openaiserverapi.KVTransferParams{DoRemotePrefill: doRemotePrefill}
+	req.Model = modelName
+	req.Stream = isStreaming
+
+	body, err := json.Marshal(req)
+	gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+	resp, err := client.Post("http://localhost/v1/completions", "application/json", strings.NewReader(string(body)))
 	gomega.Expect(err).NotTo(gomega.HaveOccurred())
 	defer func() {
 		err := resp.Body.Close()
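The test helper now builds its body with json.Marshal instead of a hand-written fmt.Sprintf template, which moves do_remote_prefill under the nested kv_transfer_params object. A self-contained sketch (stand-in types trimmed to the json-tagged fields used here; the prompt and model values are hypothetical) of the wire format the test sends after this change:

package main

import (
	"encoding/json"
	"fmt"
)

// Minimal local stand-ins for the simulator's request types, keeping only
// the fields this example needs.
type KVTransferParams struct {
	DoRemotePrefill bool `json:"do_remote_prefill"`
}

type TextCompletionRequest struct {
	Prompt   string            `json:"prompt"`
	Model    string            `json:"model"`
	Stream   bool              `json:"stream"`
	KVParams *KVTransferParams `json:"kv_transfer_params"`
}

func main() {
	req := &TextCompletionRequest{
		Prompt:   "hello",
		Model:    "my-model",
		Stream:   false,
		KVParams: &KVTransferParams{DoRemotePrefill: true},
	}
	body, err := json.Marshal(req)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
	// {"prompt":"hello","model":"my-model","stream":false,
	//  "kv_transfer_params":{"do_remote_prefill":true}}
}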

pkg/openai-server-api/request.go

Lines changed: 15 additions & 8 deletions

@@ -91,22 +91,29 @@ type baseCompletionRequest struct {
 	StreamOptions StreamOptions `json:"stream_options"`
 	// Model defines Model name to use for "inference", could be base Model name or one of available LoRA adapters
 	Model string `json:"model"`
+	// KVParams kv transfer related fields
+	KVParams *KVTransferParams `json:"kv_transfer_params"`
+	// The number of tokens in the prompt that are in the local KV Cache
+	cachedPromptTokens int
+	// IgnoreEOS is a boolean value, true when the model should ignore end-of-sequence tokens
+	IgnoreEOS bool `json:"ignore_eos"`
+}
+
+type KVTransferParams struct {
 	// DoRemoteDecode boolean value, true when request's decode will be done on remote pod
 	DoRemoteDecode bool `json:"do_remote_decode"`
 	// DoRemotePrefill boolean value, true when request's prefill was done on remote pod
 	DoRemotePrefill bool `json:"do_remote_prefill"`
-	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
-	RemoteBlockIds []string `json:"remote_block_ids"`
 	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
 	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
 	// RemoteHost is a hostname or IP address of the remote server handling prefill
 	RemoteHost string `json:"remote_host"`
 	// RemotePort is a port of the remote server handling prefill
 	RemotePort int `json:"remote_port"`
-	// The number of tokens in the prompt that are in the local KV Cache
-	cachedPromptTokens int
-	// IgnoreEOS is a boolean value, true when the model should ignore end-of-sequence tokens
-	IgnoreEOS bool `json:"ignore_eos"`
+	// TPSize is the tensor parallelism size for KV cache transfer
+	TPSize int `json:"tp_size" default:"1"`
 }
 
 // StreamOptions defines streaming options for streaming requests
@@ -132,11 +139,11 @@ func (b *baseCompletionRequest) IncludeUsage() bool {
 }
 
 func (b *baseCompletionRequest) IsDoRemoteDecode() bool {
-	return b.DoRemoteDecode
+	return b.KVParams != nil && b.KVParams.DoRemoteDecode
 }
 
 func (b *baseCompletionRequest) IsDoRemotePrefill() bool {
-	return b.DoRemotePrefill
+	return b.KVParams != nil && b.KVParams.DoRemotePrefill
 }
 
 // GetNumberOfCachedPromptTokens returns the number of tokens in the prompt that are
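Both accessors now guard against a nil KVParams, so a request that omits kv_transfer_params entirely answers false instead of dereferencing a nil pointer. A self-contained sketch (local stand-in types mirroring the diff, trimmed to what the guard needs) demonstrating that behavior:

package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed stand-ins for the structs in the diff above.
type KVTransferParams struct {
	DoRemotePrefill bool `json:"do_remote_prefill"`
}

type baseCompletionRequest struct {
	Model    string            `json:"model"`
	KVParams *KVTransferParams `json:"kv_transfer_params"`
}

// IsDoRemotePrefill mirrors the accessor from the diff: the nil check keeps
// requests without kv_transfer_params from panicking.
func (b *baseCompletionRequest) IsDoRemotePrefill() bool {
	return b.KVParams != nil && b.KVParams.DoRemotePrefill
}

func main() {
	var plain, pd baseCompletionRequest
	_ = json.Unmarshal([]byte(`{"model":"m"}`), &plain)
	_ = json.Unmarshal([]byte(`{"model":"m","kv_transfer_params":{"do_remote_prefill":true}}`), &pd)
	fmt.Println(plain.IsDoRemotePrefill(), pd.IsDoRemotePrefill()) // false true
}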

pkg/openai-server-api/response.go

Lines changed: 2 additions & 12 deletions

@@ -40,18 +40,8 @@ type baseCompletionResponse struct {
 	Usage *Usage `json:"usage"`
 	// Object is the Object type, "text_completion", "chat.completion", or "chat.completion.chunk"
 	Object string `json:"object"`
-	// DoRemoteDecode boolean value, true when request's decode will be done on remote pod
-	DoRemoteDecode bool `json:"do_remote_decode"`
-	// DoRemotePrefill boolean value, true when request's prefill was done on remote pod
-	DoRemotePrefill bool `json:"do_remote_prefill"`
-	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
-	RemoteBlockIds []string `json:"remote_block_ids"`
-	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
-	RemoteEngineId string `json:"remote_engine_id"`
-	// RemoteHost is a hostname or IP address of the remote server handling prefill
-	RemoteHost string `json:"remote_host"`
-	// RemotePort is a port of the remote server handling prefill
-	RemotePort int `json:"remote_port"`
+	// KVParams kv transfer related fields
+	KVParams *KVTransferParams `json:"kv_transfer_params"`
 }
 
 // Usage contains token Usage statistics
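Worth noting: the kv_transfer_params tag carries no omitempty, so a response with a nil KVParams marshals the key as null rather than dropping it. A trimmed sketch (stand-in type, assumed scaffolding) illustrating that:

package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed stand-in for baseCompletionResponse, keeping only the fields
// relevant to this example.
type KVTransferParams struct {
	DoRemoteDecode bool `json:"do_remote_decode"`
}

type baseCompletionResponse struct {
	Object   string            `json:"object"`
	KVParams *KVTransferParams `json:"kv_transfer_params"`
}

func main() {
	// Non-PD responses leave KVParams nil; the key still appears as null.
	resp := baseCompletionResponse{Object: "text_completion"}
	out, _ := json.Marshal(resp)
	fmt.Println(string(out))
	// {"object":"text_completion","kv_transfer_params":null}
}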
