Merge branch 'main' into fix/stream-mode

rootfs · web-flow · commit d85c966f9ba8 · 2025-09-29T07:45:40.000-05:00
diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml
@@ -72,7 +72,7 @@ jobs:
 
       - name: Install HuggingFace CLI
         run: |
-          pip install -U "huggingface_hub[cli]"
+          pip install -U "huggingface_hub[cli]" hf_transfer
 
 
       - name: Download models (minimal on PRs)
diff --git a/src/semantic-router/go.mod b/src/semantic-router/go.mod
@@ -20,6 +20,7 @@ require (
 	github.com/openai/openai-go v1.12.0
 	github.com/prometheus/client_golang v1.23.0
 	github.com/prometheus/client_model v0.6.2
+	github.com/stretchr/testify v1.10.0
 	github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
 	go.uber.org/zap v1.27.0
 	google.golang.org/grpc v1.71.1
@@ -34,6 +35,7 @@ require (
 	github.com/cockroachdb/errors v1.9.1 // indirect
 	github.com/cockroachdb/logtags v0.0.0-20211118104740-dabe8e521a4f // indirect
 	github.com/cockroachdb/redact v1.1.3 // indirect
+	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
 	github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
 	github.com/fxamacker/cbor/v2 v2.7.0 // indirect
 	github.com/getsentry/sentry-go v0.12.0 // indirect
@@ -54,6 +56,7 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
+	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/prometheus/common v0.65.0 // indirect
 	github.com/prometheus/procfs v0.16.1 // indirect
 	github.com/rogpeppe/go-internal v1.12.0 // indirect
diff --git a/src/semantic-router/go.sum b/src/semantic-router/go.sum
@@ -257,8 +257,8 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An
 github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c=
-github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
+github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go
@@ -208,6 +208,12 @@ type RequestContext struct {
 	// TTFT tracking
 	TTFTRecorded bool
 	TTFTSeconds  float64
+
+	// VSR decision tracking
+	VSRSelectedCategory string // The category selected by VSR
+	VSRReasoningMode    string // "on" or "off" - whether reasoning mode was determined to be used
+	VSRSelectedModel    string // The model selected by VSR
+	VSRCacheHit         bool   // Whether this request hit the cache
 }
 
 // handleRequestHeaders processes the request headers
@@ -379,6 +385,8 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingR
 		if err != nil {
 			observability.Errorf("Error searching cache: %v", err)
 		} else if found {
+			// Mark this request as a cache hit
+			ctx.VSRCacheHit = true
 			// Log cache hit
 			observability.LogEvent("cache_hit", map[string]interface{}{
 				"request_id": ctx.RequestID,
@@ -389,13 +397,13 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingR
 			response := http.CreateCacheHitResponse(cachedResponse)
 			return response, true
 		}
+	}
 
-		// Cache miss, store the request for later
-		err = r.Cache.AddPendingRequest(ctx.RequestID, requestModel, requestQuery, ctx.OriginalRequestBody)
-		if err != nil {
-			observability.Errorf("Error adding pending request to cache: %v", err)
-			// Continue without caching
-		}
+	// Cache miss, store the request for later
+	err = r.Cache.AddPendingRequest(ctx.RequestID, requestModel, requestQuery, ctx.OriginalRequestBody)
+	if err != nil {
+		observability.Errorf("Error adding pending request to cache: %v", err)
+		// Continue without caching
 	}
 
 	return nil, false
@@ -499,6 +507,15 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 				effortForMetrics := r.getReasoningEffort(categoryName)
 				metrics.RecordReasoningDecision(categoryName, matchedModel, useReasoning, effortForMetrics)
 
+				// Track VSR decision information
+				ctx.VSRSelectedCategory = categoryName
+				ctx.VSRSelectedModel = matchedModel
+				if useReasoning {
+					ctx.VSRReasoningMode = "on"
+				} else {
+					ctx.VSRReasoningMode = "off"
+				}
+
 				// Track the model routing change
 				metrics.RecordModelRouting(originalModel, matchedModel)
 
@@ -612,6 +629,9 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 		}
 	} else if originalModel != "auto" {
 		observability.Infof("Using specified model: %s", originalModel)
+		// Track VSR decision information for non-auto models
+		ctx.VSRSelectedModel = originalModel
+		ctx.VSRReasoningMode = "off" // Non-auto models don't use reasoning mode by default
 		// For non-auto models, check PII policy compliance
 		allContent := pii.ExtractAllContent(userContent, nonUserMessages)
 		detectedPII := r.Classifier.DetectPIIInContent(allContent)
diff --git a/src/semantic-router/pkg/extproc/response_handler.go b/src/semantic-router/pkg/extproc/response_handler.go
@@ -17,12 +17,18 @@ import (
 
 // handleResponseHeaders processes the response headers
 func (r *OpenAIRouter) handleResponseHeaders(v *ext_proc.ProcessingRequest_ResponseHeaders, ctx *RequestContext) (*ext_proc.ProcessingResponse, error) {
+	var statusCode int
+	var isSuccessful bool
+
 	// Detect upstream HTTP status and record non-2xx as errors
 	if v != nil && v.ResponseHeaders != nil && v.ResponseHeaders.Headers != nil {
 		// Determine if the response is streaming based on Content-Type
 		ctx.IsStreamingResponse = isStreamingContentType(v.ResponseHeaders.Headers)
 
-		if statusCode := getStatusFromHeaders(v.ResponseHeaders.Headers); statusCode != 0 {
+		statusCode = getStatusFromHeaders(v.ResponseHeaders.Headers)
+		isSuccessful = statusCode >= 200 && statusCode < 300
+
+		if statusCode != 0 {
 			if statusCode >= 500 {
 				metrics.RecordRequestError(getModelFromCtx(ctx), "upstream_5xx")
 			} else if statusCode >= 400 {
@@ -43,12 +49,58 @@ func (r *OpenAIRouter) handleResponseHeaders(v *ext_proc.ProcessingRequest_Respo
 		}
 	}
 
-	// Allow the response to continue without modification
+	// Prepare response headers with VSR decision tracking headers if applicable
+	var headerMutation *ext_proc.HeaderMutation
+
+	// Add VSR decision headers on a successful, non-cached response; ctx is nil-checked before any field is read
+	if ctx != nil && isSuccessful && !ctx.VSRCacheHit {
+		var setHeaders []*core.HeaderValueOption
+
+		// Add x-vsr-selected-category header
+		if ctx.VSRSelectedCategory != "" {
+			setHeaders = append(setHeaders, &core.HeaderValueOption{
+				Header: &core.HeaderValue{
+					Key:      "x-vsr-selected-category",
+					RawValue: []byte(ctx.VSRSelectedCategory),
+				},
+			})
+		}
+
+		// Add x-vsr-selected-reasoning header
+		if ctx.VSRReasoningMode != "" {
+			setHeaders = append(setHeaders, &core.HeaderValueOption{
+				Header: &core.HeaderValue{
+					Key:      "x-vsr-selected-reasoning",
+					RawValue: []byte(ctx.VSRReasoningMode),
+				},
+			})
+		}
+
+		// Add x-vsr-selected-model header
+		if ctx.VSRSelectedModel != "" {
+			setHeaders = append(setHeaders, &core.HeaderValueOption{
+				Header: &core.HeaderValue{
+					Key:      "x-vsr-selected-model",
+					RawValue: []byte(ctx.VSRSelectedModel),
+				},
+			})
+		}
+
+		// Create header mutation if we have headers to add
+		if len(setHeaders) > 0 {
+			headerMutation = &ext_proc.HeaderMutation{
+				SetHeaders: setHeaders,
+			}
+		}
+	}
+
+	// Allow the response to continue with VSR headers if applicable
 	response := &ext_proc.ProcessingResponse{
 		Response: &ext_proc.ProcessingResponse_ResponseHeaders{
 			ResponseHeaders: &ext_proc.HeadersResponse{
 				Response: &ext_proc.CommonResponse{
-					Status: ext_proc.CommonResponse_CONTINUE,
+					Status:         ext_proc.CommonResponse_CONTINUE,
+					HeaderMutation: headerMutation,
 				},
 			},
 		},
diff --git a/src/semantic-router/pkg/extproc/vsr_headers_test.go b/src/semantic-router/pkg/extproc/vsr_headers_test.go
@@ -0,0 +1,183 @@
+package extproc
+
+import (
+	"testing"
+
+	core "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
+	ext_proc "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestVSRHeadersAddedOnSuccessfulNonCachedResponse(t *testing.T) {
+	// Zero-value router; no router fields are populated for this test
+	router := &OpenAIRouter{}
+
+	// Request context carrying a complete VSR decision (category, reasoning mode, model)
+	ctx := &RequestContext{
+		VSRSelectedCategory: "math",
+		VSRReasoningMode:    "on",
+		VSRSelectedModel:    "deepseek-v31",
+		VSRCacheHit:         false, // Not a cache hit
+	}
+
+	// Upstream response headers reporting a 2xx status (200)
+	responseHeaders := &ext_proc.ProcessingRequest_ResponseHeaders{
+		ResponseHeaders: &ext_proc.HttpHeaders{
+			Headers: &core.HeaderMap{
+				Headers: []*core.HeaderValue{
+					{Key: ":status", Value: "200"},
+					{Key: "content-type", Value: "application/json"},
+				},
+			},
+		},
+	}
+
+	// Exercise the handler under test
+	response, err := router.handleResponseHeaders(responseHeaders, ctx)
+
+	// The handler must succeed and return a response
+	assert.NoError(t, err)
+	assert.NotNil(t, response)
+
+	// The response must be a ResponseHeaders message wrapping a CommonResponse
+	assert.NotNil(t, response.GetResponseHeaders())
+	assert.NotNil(t, response.GetResponseHeaders().GetResponse())
+
+	// A header mutation must be present, since all three VSR fields are set
+	headerMutation := response.GetResponseHeaders().GetResponse().GetHeaderMutation()
+	assert.NotNil(t, headerMutation, "HeaderMutation should not be nil for successful non-cached response")
+
+	setHeaders := headerMutation.GetSetHeaders()
+	assert.Len(t, setHeaders, 3, "Should have 3 VSR headers")
+
+	// Index the headers by key so the assertions do not depend on their order
+	headerMap := make(map[string]string)
+	for _, header := range setHeaders {
+		headerMap[header.Header.Key] = string(header.Header.RawValue)
+	}
+
+	assert.Equal(t, "math", headerMap["x-vsr-selected-category"])
+	assert.Equal(t, "on", headerMap["x-vsr-selected-reasoning"])
+	assert.Equal(t, "deepseek-v31", headerMap["x-vsr-selected-model"])
+}
+
+func TestVSRHeadersNotAddedOnCacheHit(t *testing.T) {
+	// Zero-value router; no router fields are populated for this test
+	router := &OpenAIRouter{}
+
+	// Full VSR decision, but the request is flagged as served from cache
+	ctx := &RequestContext{
+		VSRSelectedCategory: "math",
+		VSRReasoningMode:    "on",
+		VSRSelectedModel:    "deepseek-v31",
+		VSRCacheHit:         true, // Cache hit - headers should not be added
+	}
+
+	// Upstream response headers reporting a 2xx status (200)
+	responseHeaders := &ext_proc.ProcessingRequest_ResponseHeaders{
+		ResponseHeaders: &ext_proc.HttpHeaders{
+			Headers: &core.HeaderMap{
+				Headers: []*core.HeaderValue{
+					{Key: ":status", Value: "200"},
+					{Key: "content-type", Value: "application/json"},
+				},
+			},
+		},
+	}
+
+	// Exercise the handler under test
+	response, err := router.handleResponseHeaders(responseHeaders, ctx)
+
+	// The handler must still succeed and return a response
+	assert.NoError(t, err)
+	assert.NotNil(t, response)
+
+	// No header mutation is expected: the cache-hit flag suppresses the VSR headers
+	headerMutation := response.GetResponseHeaders().GetResponse().GetHeaderMutation()
+	assert.Nil(t, headerMutation, "HeaderMutation should be nil for cache hit")
+}
+
+func TestVSRHeadersNotAddedOnErrorResponse(t *testing.T) {
+	// Zero-value router; no router fields are populated for this test
+	router := &OpenAIRouter{}
+
+	// Full VSR decision on a request that did not hit the cache
+	ctx := &RequestContext{
+		VSRSelectedCategory: "math",
+		VSRReasoningMode:    "on",
+		VSRSelectedModel:    "deepseek-v31",
+		VSRCacheHit:         false, // Not a cache hit
+	}
+
+	// Upstream response headers reporting a non-2xx status (500)
+	responseHeaders := &ext_proc.ProcessingRequest_ResponseHeaders{
+		ResponseHeaders: &ext_proc.HttpHeaders{
+			Headers: &core.HeaderMap{
+				Headers: []*core.HeaderValue{
+					{Key: ":status", Value: "500"},
+					{Key: "content-type", Value: "application/json"},
+				},
+			},
+		},
+	}
+
+	// Exercise the handler under test
+	response, err := router.handleResponseHeaders(responseHeaders, ctx)
+
+	// An upstream error status is not a handler error: a response is still returned
+	assert.NoError(t, err)
+	assert.NotNil(t, response)
+
+	// No header mutation is expected: VSR headers are only attached to 2xx responses
+	headerMutation := response.GetResponseHeaders().GetResponse().GetHeaderMutation()
+	assert.Nil(t, headerMutation, "HeaderMutation should be nil for error response")
+}
+
+func TestVSRHeadersPartialInformation(t *testing.T) {
+	// Zero-value router; no router fields are populated for this test
+	router := &OpenAIRouter{}
+
+	// VSR decision with the reasoning mode deliberately left empty
+	ctx := &RequestContext{
+		VSRSelectedCategory: "math",
+		VSRReasoningMode:    "", // Empty reasoning mode
+		VSRSelectedModel:    "deepseek-v31",
+		VSRCacheHit:         false,
+	}
+
+	// Upstream response headers reporting a 2xx status (200)
+	responseHeaders := &ext_proc.ProcessingRequest_ResponseHeaders{
+		ResponseHeaders: &ext_proc.HttpHeaders{
+			Headers: &core.HeaderMap{
+				Headers: []*core.HeaderValue{
+					{Key: ":status", Value: "200"},
+					{Key: "content-type", Value: "application/json"},
+				},
+			},
+		},
+	}
+
+	// Exercise the handler under test
+	response, err := router.handleResponseHeaders(responseHeaders, ctx)
+
+	// The handler must succeed and return a response
+	assert.NoError(t, err)
+	assert.NotNil(t, response)
+
+	// A mutation is still expected, holding only the headers whose context field is non-empty
+	headerMutation := response.GetResponseHeaders().GetResponse().GetHeaderMutation()
+	assert.NotNil(t, headerMutation)
+
+	setHeaders := headerMutation.GetSetHeaders()
+	assert.Len(t, setHeaders, 2, "Should have 2 VSR headers (excluding empty reasoning mode)")
+
+	// Index the headers by key so the assertions do not depend on their order
+	headerMap := make(map[string]string)
+	for _, header := range setHeaders {
+		headerMap[header.Header.Key] = string(header.Header.RawValue)
+	}
+
+	assert.Equal(t, "math", headerMap["x-vsr-selected-category"])
+	assert.Equal(t, "deepseek-v31", headerMap["x-vsr-selected-model"])
+	assert.NotContains(t, headerMap, "x-vsr-selected-reasoning", "Empty reasoning mode should not be added")
+}
diff --git a/src/semantic-router/pkg/utils/http/response.go b/src/semantic-router/pkg/utils/http/response.go
@@ -169,7 +169,7 @@ func CreateCacheHitResponse(cachedResponse []byte) *ext_proc.ProcessingResponse
 				},
 				{
 					Header: &core.HeaderValue{
-						Key:      "x-cache-hit",
+						Key:      "x-vsr-cache-hit",
 						RawValue: []byte("true"),
 					},
 				},
diff --git a/website/docs/installation/installation.md b/website/docs/installation/installation.md
diff --git a/website/docs/troubleshooting/vsr-headers.md b/website/docs/troubleshooting/vsr-headers.md

Original file line number	Diff line number	Diff line change
`@@ -169,7 +169,7 @@ func CreateCacheHitResponse(cachedResponse []byte) *ext_proc.ProcessingResponse`
`169`	`169`	`},`
`170`	`170`	`{`
`171`	`171`	`Header: &core.HeaderValue{`
`172`		`- Key: "x-cache-hit",`
	`172`	`+ Key: "x-vsr-cache-hit",`
`173`	`173`	`RawValue: []byte("true"),`
`174`	`174`	`},`
`175`	`175`	`},`