
Commit 159aa54

[BBR] perf: optimize model name extraction with selective JSON unmarshaling using struct tags (#1359)
* [BBR] perf: optimize model name extraction with selective JSON unmarshaling using struct tags

Implements selective JSON unmarshaling using struct tags to extract only the `model` field instead of full JSON parsing.

Performance improvements:

- Small payloads (1 KB): 2.1x faster (8.7 µs → 3.8 µs), 14x less memory (5.2 KB → 368 B)
- Medium payloads (25 KB): 1.4x faster (26.2 µs → 18.5 µs), 35x less memory (12.7 KB → 368 B)
- Large payloads (200 KB): 1.1x faster (130 µs → 117 µs), 133x less memory (48.8 KB → 368 B)
- Very large payloads (1 MB): 1.1x faster (754.7 µs → 691 µs), 730x less memory (269 KB → 368 B)

The struct approach consistently uses ~368 B regardless of payload size, while full unmarshaling scales linearly with input size. Memory allocations are reduced from 100+ to just 10 per operation across all payload sizes.

Benchmark results (3 runs, Intel Core Ultra 7 165H):

```
$ go test -count=3 -bench=BenchmarkExtractModel -benchmem ./pkg/bbr/handlers/
goos: linux
goarch: amd64
pkg: sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/handlers
cpu: Intel(R) Core(TM) Ultra 7 165H
BenchmarkExtractModel_Small_FullUnmarshal-22          125146      8658 ns/op      5224 B/op     109 allocs/op
BenchmarkExtractModel_Small_FullUnmarshal-22          144015      8590 ns/op      5224 B/op     109 allocs/op
BenchmarkExtractModel_Small_FullUnmarshal-22          134164      8884 ns/op      5224 B/op     109 allocs/op
BenchmarkExtractModel_Small_StructUnmarshal-22        326854      3767 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_Small_StructUnmarshal-22        323583      3714 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_Small_StructUnmarshal-22        336662      3815 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_Medium_FullUnmarshal-22          44419     26842 ns/op     12744 B/op     176 allocs/op
BenchmarkExtractModel_Medium_FullUnmarshal-22          42657     26118 ns/op     12744 B/op     176 allocs/op
BenchmarkExtractModel_Medium_FullUnmarshal-22          43706     25536 ns/op     12744 B/op     176 allocs/op
BenchmarkExtractModel_Medium_StructUnmarshal-22        64760     18513 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_Medium_StructUnmarshal-22        61837     18548 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_Medium_StructUnmarshal-22        64534     18582 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_Large_FullUnmarshal-22            9825    129805 ns/op     48824 B/op     257 allocs/op
BenchmarkExtractModel_Large_FullUnmarshal-22            9357    126792 ns/op     48824 B/op     257 allocs/op
BenchmarkExtractModel_Large_FullUnmarshal-22            9558    135569 ns/op     48824 B/op     257 allocs/op
BenchmarkExtractModel_Large_StructUnmarshal-22         10270    116782 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_Large_StructUnmarshal-22          9770    117259 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_Large_StructUnmarshal-22          8673    117041 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_VeryLarge_FullUnmarshal-22        1530    743882 ns/op    268974 B/op     498 allocs/op
BenchmarkExtractModel_VeryLarge_FullUnmarshal-22        1437    795705 ns/op    268974 B/op     498 allocs/op
BenchmarkExtractModel_VeryLarge_FullUnmarshal-22        1609    724524 ns/op    268974 B/op     498 allocs/op
BenchmarkExtractModel_VeryLarge_StructUnmarshal-22      1743    682454 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_VeryLarge_StructUnmarshal-22      1777    695514 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_VeryLarge_StructUnmarshal-22      1611    695033 ns/op       368 B/op      10 allocs/op
BenchmarkExtractModel_ModelAtEnd_FullUnmarshal-22       3019    393604 ns/op    132442 B/op     198 allocs/op
BenchmarkExtractModel_ModelAtEnd_FullUnmarshal-22       3061    398825 ns/op    132442 B/op     198 allocs/op
BenchmarkExtractModel_ModelAtEnd_FullUnmarshal-22       2961    395819 ns/op    132442 B/op     198 allocs/op
BenchmarkExtractModel_ModelAtEnd_StructUnmarshal-22     3091    400157 ns/op       304 B/op       9 allocs/op
BenchmarkExtractModel_ModelAtEnd_StructUnmarshal-22     2908    391080 ns/op       304 B/op       9 allocs/op
BenchmarkExtractModel_ModelAtEnd_StructUnmarshal-22     3067    395283 ns/op       304 B/op       9 allocs/op
```

Benchmark code:

```golang
// pkg/bbr/handlers/request_bench_test.go

/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package handlers

import (
	"bytes"
	"encoding/json"
	"fmt"
	"strings"
	"testing"
)

type ModelWrapper struct {
	Model any `json:"model"`
}

// extractModelFullUnmarshal represents the old approach - full JSON unmarshaling
func extractModelFullUnmarshal(data []byte) (any, error) {
	var requestBody map[string]any
	if err := json.Unmarshal(data, &requestBody); err != nil {
		return nil, err
	}
	modelVal, ok := requestBody["model"]
	if !ok {
		return nil, fmt.Errorf("model field not found")
	}
	return modelVal, nil
}

// extractModelStructUnmarshal represents the struct approach
func extractModelStructUnmarshal(data []byte) (any, error) {
	var requestBody ModelWrapper
	if err := json.Unmarshal(data, &requestBody); err != nil {
		return nil, err
	}
	if requestBody.Model == "" {
		return nil, fmt.Errorf("model field not found")
	}
	return requestBody.Model, nil
}

// extractModelStreaming efficiently extracts the "model" field from JSON
// without unmarshaling the entire payload.
func extractModelStreaming(data []byte) (any, error) {
	decoder := json.NewDecoder(bytes.NewReader(data))

	// Read opening brace
	token, err := decoder.Token()
	if err != nil {
		return "", err
	}
	if delim, ok := token.(json.Delim); !ok || delim != '{' {
		return "", fmt.Errorf("expected JSON object")
	}

	// Scan through the JSON object looking for "model" key
	for decoder.More() {
		// Read the key
		token, err := decoder.Token()
		if err != nil {
			return "", err
		}
		key, ok := token.(string)
		if !ok {
			return "", fmt.Errorf("expected string key")
		}

		if key == "model" {
			// Found the model key, read its value
			token, err := decoder.Token()
			if err != nil {
				return "", err
			}
			return token, nil
		}

		// Skip value
		_ = decoder.Decode(nil)
	}

	return "", fmt.Errorf("model field not found")
}

// generateOpenAIPayload creates realistic OpenAI chat completion request payloads
func generateOpenAIPayload(messageCount int, avgMessageLength int) []byte {
	messages := make([]map[string]string, messageCount)

	// Create realistic conversation messages
	baseContent := strings.Repeat("This is a realistic message content that represents typical AI conversation data. ", avgMessageLength/100)
	for i := 0; i < messageCount; i++ {
		role := "user"
		if i%2 == 1 {
			role = "assistant"
		}
		messages[i] = map[string]string{
			"role":    role,
			"content": baseContent + strings.Repeat("x", avgMessageLength%100),
		}
	}

	payload := map[string]any{
		"model":             "gpt-4o",
		"messages":          messages,
		"max_tokens":        2048,
		"temperature":       0.7,
		"top_p":             1.0,
		"frequency_penalty": 0.0,
		"presence_penalty":  0.0,
		"stream":            false,
		"tools": []map[string]any{
			{
				"type": "function",
				"function": map[string]any{
					"name":        "get_weather",
					"description": "Get the current weather in a given location",
					"parameters": map[string]any{
						"type": "object",
						"properties": map[string]any{
							"location": map[string]any{
								"type":        "string",
								"description": "The city and state, e.g. San Francisco, CA",
							},
							"unit": map[string]any{
								"type": "string",
								"enum": []string{"celsius", "fahrenheit"},
							},
						},
						"required": []string{"location"},
					},
				},
			},
		},
	}

	data, _ := json.Marshal(payload)
	return data
}

// Benchmark small payloads (typical single message)
func BenchmarkExtractModel_Small_FullUnmarshal(b *testing.B) {
	payload := generateOpenAIPayload(2, 100) // ~1KB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelFullUnmarshal(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// Benchmark small payloads (typical single message)
func BenchmarkExtractModel_Small_StructUnmarshal(b *testing.B) {
	payload := generateOpenAIPayload(2, 100) // ~1KB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStructUnmarshal(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

func BenchmarkExtractModel_Small_Streaming(b *testing.B) {
	payload := generateOpenAIPayload(2, 100) // ~1KB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStreaming(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// Benchmark medium payloads (conversation with context)
func BenchmarkExtractModel_Medium_FullUnmarshal(b *testing.B) {
	payload := generateOpenAIPayload(10, 500) // ~25KB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelFullUnmarshal(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// Benchmark medium payloads (conversation with context)
func BenchmarkExtractModel_Medium_StructUnmarshal(b *testing.B) {
	payload := generateOpenAIPayload(10, 500) // ~25KB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStructUnmarshal(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

func BenchmarkExtractModel_Medium_Streaming(b *testing.B) {
	payload := generateOpenAIPayload(10, 500) // ~25KB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStreaming(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// Benchmark large payloads (long conversation with RAG context)
func BenchmarkExtractModel_Large_FullUnmarshal(b *testing.B) {
	payload := generateOpenAIPayload(20, 2000) // ~200KB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelFullUnmarshal(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// Benchmark large payloads (long conversation with RAG context)
func BenchmarkExtractModel_Large_StructUnmarshal(b *testing.B) {
	payload := generateOpenAIPayload(20, 2000) // ~200KB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStructUnmarshal(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

func BenchmarkExtractModel_Large_Streaming(b *testing.B) {
	payload := generateOpenAIPayload(20, 2000) // ~200KB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStreaming(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// Benchmark very large payloads (extensive context/embeddings)
func BenchmarkExtractModel_VeryLarge_FullUnmarshal(b *testing.B) {
	payload := generateOpenAIPayload(50, 5000) // ~1MB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelFullUnmarshal(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// Benchmark very large payloads (extensive context/embeddings)
func BenchmarkExtractModel_VeryLarge_StructUnmarshal(b *testing.B) {
	payload := generateOpenAIPayload(50, 5000) // ~1MB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStructUnmarshal(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

func BenchmarkExtractModel_VeryLarge_Streaming(b *testing.B) {
	payload := generateOpenAIPayload(50, 5000) // ~1MB
	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStreaming(payload)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// Benchmark worst case - model field at the end
func BenchmarkExtractModel_ModelAtEnd_FullUnmarshal(b *testing.B) {
	messages := make([]map[string]string, 20)
	for i := 0; i < 20; i++ {
		messages[i] = map[string]string{
			"role":    "user",
			"content": strings.Repeat("Very long message content that makes the payload large. ", 100),
		}
	}
	// Put model field at the end to test worst case for streaming parser
	payload := map[string]any{
		"messages":          messages,
		"max_tokens":        2048,
		"temperature":       0.7,
		"top_p":             1.0,
		"frequency_penalty": 0.0,
		"presence_penalty":  0.0,
		"stream":            false,
		"model":             "gpt-4o", // Model at the end
	}
	data, _ := json.Marshal(payload)

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelFullUnmarshal(data)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// Benchmark worst case - model field at the end
func BenchmarkExtractModel_ModelAtEnd_StructUnmarshal(b *testing.B) {
	messages := make([]map[string]string, 20)
	for i := 0; i < 20; i++ {
		messages[i] = map[string]string{
			"role":    "user",
			"content": strings.Repeat("Very long message content that makes the payload large. ", 100),
		}
	}
	// Put model field at the end to test worst case for streaming parser
	payload := map[string]any{
		"messages":          messages,
		"max_tokens":        2048,
		"temperature":       0.7,
		"top_p":             1.0,
		"frequency_penalty": 0.0,
		"presence_penalty":  0.0,
		"stream":            false,
		"model":             "gpt-4o", // Model at the end
	}
	data, _ := json.Marshal(payload)

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStructUnmarshal(data)
		if err != nil {
			b.Fatal(err)
		}
	}
}

func BenchmarkExtractModel_ModelAtEnd_Streaming(b *testing.B) {
	messages := make([]map[string]string, 20)
	for i := 0; i < 20; i++ {
		messages[i] = map[string]string{
			"role":    "user",
			"content": strings.Repeat("Very long message content that makes the payload large. ", 100),
		}
	}
	// Put model field at the end to test worst case for streaming parser
	payload := map[string]any{
		"messages":          messages,
		"max_tokens":        2048,
		"temperature":       0.7,
		"top_p":             1.0,
		"frequency_penalty": 0.0,
		"presence_penalty":  0.0,
		"stream":            false,
		"model":             "gpt-4o", // Model at the end
	}
	data, _ := json.Marshal(payload)

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_, err := extractModelStreaming(data)
		if err != nil {
			b.Fatal(err)
		}
	}
}
```

Signed-off-by: Pierangelo Di Pilato <[email protected]>

* Force model to be a string

Signed-off-by: Pierangelo Di Pilato <[email protected]>

---------

Signed-off-by: Pierangelo Di Pilato <[email protected]>
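For readers skimming past the benchmarks: the whole optimization reduces to unmarshaling into a one-field struct, since `encoding/json` populates only the tagged fields and skips everything else in the payload. Below is a minimal standalone sketch of that pattern; `extractModel` is a hypothetical helper name used here for illustration, while the production change inlines the equivalent logic in `HandleRequestBody` (see the diff below).

```golang
package main

import (
	"encoding/json"
	"fmt"
)

// RequestBody mirrors the struct this commit adds to request.go:
// encoding/json fills only the tagged "model" field and ignores the
// rest of the payload (messages, tools, sampling parameters, ...).
type RequestBody struct {
	Model string `json:"model"`
}

// extractModel is a hypothetical helper, for illustration only.
func extractModel(body []byte) (string, error) {
	var rb RequestBody
	if err := json.Unmarshal(body, &rb); err != nil {
		return "", err
	}
	// An absent "model" key leaves the zero value "", which is how the
	// new code detects a missing model parameter.
	if rb.Model == "" {
		return "", fmt.Errorf("request body does not contain model parameter")
	}
	return rb.Model, nil
}

func main() {
	model, err := extractModel([]byte(`{"model":"gpt-4o","messages":[{"role":"user","content":"hi"}]}`))
	fmt.Println(model, err) // gpt-4o <nil>

	_, err = extractModel([]byte(`{"messages":[]}`))
	fmt.Println(err) // request body does not contain model parameter
}
```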
1 parent 374131a commit 159aa54

File tree

1 file changed: +9 -13 lines changed

pkg/bbr/handlers/request.go

Lines changed: 9 additions & 13 deletions
```diff
@@ -19,7 +19,6 @@ package handlers
 import (
 	"context"
 	"encoding/json"
-	"fmt"
 
 	basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
 	eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
@@ -31,18 +30,22 @@ import (
 
 const modelHeader = "X-Gateway-Model-Name"
 
+type RequestBody struct {
+	Model string `json:"model"`
+}
+
 // HandleRequestBody handles request bodies.
 func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte) ([]*eppb.ProcessingResponse, error) {
 	logger := log.FromContext(ctx)
 	var ret []*eppb.ProcessingResponse
 
-	var requestBody map[string]any
+	var requestBody RequestBody
 	if err := json.Unmarshal(requestBodyBytes, &requestBody); err != nil {
+		metrics.RecordModelNotParsedCounter()
 		return nil, err
 	}
 
-	modelVal, ok := requestBody["model"]
-	if !ok {
+	if requestBody.Model == "" {
 		metrics.RecordModelNotInBodyCounter()
 		logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter")
 		if s.streaming {
@@ -63,13 +66,6 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte)
 		return ret, nil
 	}
 
-	modelStr, ok := modelVal.(string)
-	if !ok {
-		metrics.RecordModelNotParsedCounter()
-		logger.V(logutil.DEFAULT).Info("Model parameter value is not a string")
-		return nil, fmt.Errorf("the model parameter value %v is not a string", modelVal)
-	}
-
 	metrics.RecordSuccessCounter()
 
 	if s.streaming {
@@ -83,7 +79,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte)
 					{
 						Header: &basepb.HeaderValue{
 							Key:      modelHeader,
-							RawValue: []byte(modelStr),
+							RawValue: []byte(requestBody.Model),
 						},
 					},
 				},
@@ -108,7 +104,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte)
 					{
 						Header: &basepb.HeaderValue{
 							Key:      modelHeader,
-							RawValue: []byte(modelStr),
+							RawValue: []byte(requestBody.Model),
 						},
 					},
 				},
```
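A side effect of typing `Model` as `string` (the "Force model to be a string" follow-up): a payload whose `model` value is not a string now fails inside `json.Unmarshal` itself with a `*json.UnmarshalTypeError`, so the deleted `modelVal.(string)` assertion is subsumed by the unmarshal error path, which is also where `RecordModelNotParsedCounter` moved. A small sketch of that behavior, assuming only standard-library semantics:

```golang
package main

import (
	"encoding/json"
	"errors"
	"fmt"
)

type RequestBody struct {
	Model string `json:"model"`
}

func main() {
	var rb RequestBody
	// A non-string model now surfaces as an unmarshal type error,
	// covering the case the removed type assertion used to handle.
	err := json.Unmarshal([]byte(`{"model": 42}`), &rb)

	var typeErr *json.UnmarshalTypeError
	fmt.Println(errors.As(err, &typeErr)) // true
	fmt.Println(err)                      // json: cannot unmarshal number into Go struct field RequestBody.model of type string
}
```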

0 commit comments