Skip to content

Commit 7beb678

Browse files
authored
Merge pull request #1391 from krissetto/better-dmr-support
Better DMR support
2 parents de1b4d3 + 1909cbc commit 7beb678

File tree

7 files changed

+668
-98
lines changed

7 files changed

+668
-98
lines changed

docs/PROVIDERS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,6 @@ The DMR provider supports speculative decoding for faster inference. Configure i
134134
- `speculative_num_tokens` (int): Number of tokens to generate speculatively
135135
- `speculative_acceptance_rate` (float): Acceptance rate threshold for speculative tokens
136136

137-
All three options are passed to `docker model configure` as command-line flags.
137+
All three options are sent to Model Runner via its internal `POST /engines/_configure` API endpoint.
138138

139139
You can also pass any flag of the underlying model runtime (llama.cpp or vllm) using the `runtime_flags` option

docs/USAGE.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -722,12 +722,12 @@ models:
722722
speculative_acceptance_rate: 0.8 # Acceptance rate threshold
723723
```
724724

725-
All three speculative decoding options are passed to `docker model configure` as flags:
726-
- `speculative_draft_model` → `--speculative-draft-model`
727-
- `speculative_num_tokens` → `--speculative-num-tokens`
728-
- `speculative_acceptance_rate` → `--speculative-acceptance-rate`
725+
All three speculative decoding options are sent to Model Runner via its internal `POST /engines/_configure` API endpoint:
726+
- `speculative_draft_model` → `speculative.draft_model`
727+
- `speculative_num_tokens` → `speculative.num_tokens`
728+
- `speculative_acceptance_rate` → `speculative.min_acceptance_rate`
729729

730-
These options work alongside `max_tokens` (which sets `--context-size`) and `runtime_flags`.
730+
These options work alongside `max_tokens` (which sets `context-size`) and `runtime_flags`.
731731

732732
##### Troubleshooting:
733733

pkg/fake/proxy.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -206,10 +206,14 @@ func RemoveHeadersHook(i *cassette.Interaction) error {
206206
return nil
207207
}
208208

209-
// DefaultMatcher creates a matcher that normalizes tool call IDs for consistent matching.
209+
// DefaultMatcher creates a matcher that normalizes dynamic fields for consistent matching.
210210
// The onError callback is called if reading the request body fails (nil logs and returns false).
211211
func DefaultMatcher(onError func(err error)) recorder.MatcherFunc {
212+
// Normalize tool call IDs (they change between requests)
212213
callIDRegex := regexp.MustCompile(`call_[a-z0-9\-]+`)
214+
// Normalize max_tokens/max_output_tokens/maxOutputTokens field (varies based on models.dev
215+
// cache state and provider cloning behavior). Handles both snake_case and camelCase variants.
216+
maxTokensRegex := regexp.MustCompile(`"(?:max_(?:output_)?tokens|maxOutputTokens)":\d+,?`)
213217

214218
return func(r *http.Request, i cassette.Request) bool {
215219
if r.Body == nil || r.Body == http.NoBody {
@@ -234,8 +238,13 @@ func DefaultMatcher(onError func(err error)) recorder.MatcherFunc {
234238
r.Body.Close()
235239
r.Body = io.NopCloser(bytes.NewBuffer(reqBody))
236240

237-
// Normalize tool call IDs for matching
238-
return callIDRegex.ReplaceAllString(string(reqBody), "call_ID") == callIDRegex.ReplaceAllString(i.Body, "call_ID")
241+
// Normalize dynamic fields for matching
242+
normalizedReq := callIDRegex.ReplaceAllString(string(reqBody), "call_ID")
243+
normalizedReq = maxTokensRegex.ReplaceAllString(normalizedReq, "")
244+
normalizedCassette := callIDRegex.ReplaceAllString(i.Body, "call_ID")
245+
normalizedCassette = maxTokensRegex.ReplaceAllString(normalizedCassette, "")
246+
247+
return normalizedReq == normalizedCassette
239248
}
240249
}
241250

pkg/model/provider/clone.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@ func CloneWithOptions(ctx context.Context, base Provider, opts ...options.Opt) P
1919

2020
// Apply max_tokens override if present in options
2121
// We need to apply it to the ModelConfig itself since that's what providers use
22+
// Only update MaxTokens if an option explicitly sets it (non-zero value)
2223
modelConfig := config.ModelConfig
2324
for _, opt := range mergedOpts {
2425
tempOpts := &options.ModelOptions{}
2526
opt(tempOpts)
26-
mt := tempOpts.MaxTokens()
27-
modelConfig.MaxTokens = &mt
27+
if mt := tempOpts.MaxTokens(); mt != 0 {
28+
modelConfig.MaxTokens = &mt
29+
}
2830
}
2931

3032
// Use NewWithModels to support cloning routers that reference other models.

pkg/model/provider/clone_test.go

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,3 +137,85 @@ func TestCloneWithOptions_DirectProvider(t *testing.T) {
137137
assert.Nil(t, clonedConfig.ModelConfig.ThinkingBudget,
138138
"ThinkingBudget should be nil after cloning with WithThinking(false)")
139139
}
140+
141+
func TestCloneWithOptions_PreservesMaxTokens(t *testing.T) {
142+
t.Parallel()
143+
144+
// This test verifies that max_tokens is preserved when cloning a provider
145+
// with options that don't explicitly set max_tokens. Previously, options
146+
// that didn't set max_tokens would accidentally clear it to 0.
147+
148+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
149+
w.Header().Set("Content-Type", "text/event-stream")
150+
_, _ = w.Write([]byte("data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\n"))
151+
_, _ = w.Write([]byte("data: [DONE]\n\n"))
152+
}))
153+
defer server.Close()
154+
155+
maxTokens := int64(8192)
156+
cfg := &latest.ModelConfig{
157+
Provider: "openai",
158+
Model: "gpt-4o",
159+
BaseURL: server.URL,
160+
MaxTokens: &maxTokens,
161+
}
162+
163+
env := newCloneTestEnv(map[string]string{
164+
"OPENAI_API_KEY": "test-key",
165+
})
166+
167+
provider, err := New(t.Context(), cfg, env, options.WithMaxTokens(maxTokens))
168+
require.NoError(t, err)
169+
170+
// Clone with an option that doesn't affect max_tokens (e.g., WithThinking)
171+
cloned := CloneWithOptions(t.Context(), provider, options.WithThinking(false))
172+
173+
clonedConfig := cloned.BaseConfig()
174+
175+
// MaxTokens should be preserved, not cleared to 0 or nil
176+
require.NotNil(t, clonedConfig.ModelConfig.MaxTokens,
177+
"MaxTokens should be preserved after cloning with unrelated options")
178+
assert.Equal(t, maxTokens, *clonedConfig.ModelConfig.MaxTokens,
179+
"MaxTokens value should be unchanged after cloning")
180+
}
181+
182+
func TestCloneWithOptions_OverridesMaxTokens(t *testing.T) {
183+
t.Parallel()
184+
185+
// This test verifies that max_tokens can be explicitly overridden when cloning.
186+
187+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
188+
w.Header().Set("Content-Type", "text/event-stream")
189+
_, _ = w.Write([]byte("data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\n"))
190+
_, _ = w.Write([]byte("data: [DONE]\n\n"))
191+
}))
192+
defer server.Close()
193+
194+
originalMaxTokens := int64(8192)
195+
newMaxTokens := int64(4096)
196+
197+
cfg := &latest.ModelConfig{
198+
Provider: "openai",
199+
Model: "gpt-4o",
200+
BaseURL: server.URL,
201+
MaxTokens: &originalMaxTokens,
202+
}
203+
204+
env := newCloneTestEnv(map[string]string{
205+
"OPENAI_API_KEY": "test-key",
206+
})
207+
208+
provider, err := New(t.Context(), cfg, env, options.WithMaxTokens(originalMaxTokens))
209+
require.NoError(t, err)
210+
211+
// Clone with an explicit max_tokens override
212+
cloned := CloneWithOptions(t.Context(), provider, options.WithMaxTokens(newMaxTokens))
213+
214+
clonedConfig := cloned.BaseConfig()
215+
216+
// MaxTokens should be updated to the new value
217+
require.NotNil(t, clonedConfig.ModelConfig.MaxTokens,
218+
"MaxTokens should not be nil after cloning with explicit override")
219+
assert.Equal(t, newMaxTokens, *clonedConfig.ModelConfig.MaxTokens,
220+
"MaxTokens should be updated to the new value")
221+
}

0 commit comments

Comments (0)