Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ tools:
fallback_to_empty: true

prompt_guard:
enabled: true
enabled: true # Global default - can be overridden per category with jailbreak_enabled
use_modernbert: true
model_id: "models/jailbreak_classifier_modernbert-base_model"
threshold: 0.7
Expand Down Expand Up @@ -62,6 +62,7 @@ classifier:
categories:
- name: business
system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations."
# jailbreak_enabled: true # Optional: Override global jailbreak detection per category
model_scores:
- model: qwen3
score: 0.7
Expand Down
111 changes: 111 additions & 0 deletions config/examples/jailbreak_category_example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Category-Level Jailbreak Detection Example
# This example demonstrates how to configure jailbreak detection at the category level
# Different categories can have different jailbreak detection settings based on their risk profiles

# Global jailbreak detection configuration (can be overridden per category)
prompt_guard:
enabled: true # Global default - can be overridden per category
use_modernbert: true
model_id: "models/jailbreak_classifier_modernbert-base_model"
threshold: 0.7
use_cpu: true
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# Classifier configuration
classifier:
category_model:
model_id: "models/category_classifier_modernbert-base_model"
use_modernbert: true
threshold: 0.6
use_cpu: true
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"

# Categories with different jailbreak detection settings
categories:
# High-security category: Enable jailbreak detection
- name: business
description: "Business queries, strategy, and professional advice"
jailbreak_enabled: true # Explicitly enable (if omitted, the category inherits prompt_guard.enabled)
system_prompt: "You are a professional business consultant. Provide practical, actionable business advice."
model_scores:
- model: qwen3
score: 0.7
use_reasoning: false

# Public-facing category: Enable jailbreak detection
- name: customer_support
description: "Customer support and general inquiries"
jailbreak_enabled: true # Explicitly enable for customer-facing content
system_prompt: "You are a friendly customer support agent. Help users with their questions."
model_scores:
- model: qwen3
score: 0.8
use_reasoning: false

# Internal tool category: Disable jailbreak detection (trusted environment)
- name: code_generation
description: "Internal code generation and development tools"
jailbreak_enabled: false # Disable for internal developer tools
system_prompt: "You are a code generation assistant for internal developers."
model_scores:
- model: qwen3
score: 0.9
use_reasoning: true

# Testing category: Disable jailbreak detection
- name: testing
description: "Testing and quality assurance queries"
jailbreak_enabled: false # Disable for testing purposes
system_prompt: "You are a QA assistant helping with test scenarios."
model_scores:
- model: qwen3
score: 0.6
use_reasoning: false

# Default category: Uses global setting (inherits prompt_guard.enabled)
- name: general
description: "General queries that don't fit into specific categories"
# jailbreak_enabled not specified - will inherit from global prompt_guard.enabled
system_prompt: "You are a helpful assistant."
model_scores:
- model: qwen3
score: 0.5
use_reasoning: false

# Model configuration
model_config:
"qwen3":
reasoning_family: "qwen3"
preferred_endpoints: ["endpoint1"]
pii_policy:
allow_by_default: true

# Reasoning family configurations
reasoning_families:
qwen3:
type: "chat_template_kwargs"
parameter: "thinking"

# Default model for fallback
default_model: qwen3

# vLLM endpoints configuration
vllm_endpoints:
- name: "endpoint1"
address: "127.0.0.1"
port: 8000
weight: 1

# Usage Notes:
# =============
# 1. Global Setting (prompt_guard.enabled): Sets the default for all categories
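# 2. Category Override (jailbreak_enabled): Override global setting per category (the request handler still skips detection whenever the classifier's IsJailbreakEnabled() check returns false)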
# 3. Inheritance: If jailbreak_enabled is not specified, inherits from prompt_guard.enabled
# 4. Use Cases:
# - Set jailbreak_enabled: true for high-security, public-facing categories
# - Set jailbreak_enabled: false for internal tools or trusted environments
# - Omit jailbreak_enabled to use the global default
# 5. Security Best Practices:
# - Enable jailbreak detection by default (prompt_guard.enabled: true)
# - Only disable for specific categories where the risk is managed differently
# - Consider the consequences of disabling protection on a per-category basis
14 changes: 14 additions & 0 deletions src/semantic-router/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,9 @@ type Category struct {
// SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0)
// If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold
SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"`
// JailbreakEnabled controls whether jailbreak detection is enabled for this category
// If nil, inherits from global PromptGuard.Enabled setting
JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"`
}

// GetModelReasoningFamily returns the reasoning family configuration for a given model name
Expand Down Expand Up @@ -815,3 +818,14 @@ func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName strin
// Fall back to global cache threshold or bert threshold
return c.GetCacheSimilarityThreshold()
}

// IsJailbreakEnabledForCategory reports whether jailbreak detection should run
// for the category named categoryName.
//
// An explicit JailbreakEnabled value on the category wins. When the category
// lookup returns nil, or the category leaves JailbreakEnabled unset (nil), the
// global PromptGuard.Enabled value is returned instead.
func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool {
	cat := c.GetCategoryByName(categoryName)
	if cat == nil || cat.JailbreakEnabled == nil {
		// No per-category override available: use the global default.
		return c.PromptGuard.Enabled
	}
	return *cat.JailbreakEnabled
}
119 changes: 119 additions & 0 deletions src/semantic-router/pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1910,4 +1910,123 @@ categories:
})
})
})

Describe("IsJailbreakEnabledForCategory", func() {
	// routerConfigWith builds a RouterConfig containing a single category named
	// "test". globalEnabled sets PromptGuard.Enabled; override is stored in the
	// category's JailbreakEnabled field (nil leaves the category without an
	// explicit setting).
	routerConfigWith := func(globalEnabled bool, override *bool) *config.RouterConfig {
		return &config.RouterConfig{
			PromptGuard: config.PromptGuardConfig{
				Enabled: globalEnabled,
			},
			Categories: []config.Category{{
				Name:             "test",
				JailbreakEnabled: override,
				ModelScores:      []config.ModelScore{{Model: "test", Score: 1.0}},
			}},
		}
	}

	Context("when global jailbreak is enabled", func() {
		It("should return true for category without explicit setting", func() {
			cfg := routerConfigWith(true, nil)

			Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue())
		})

		It("should return false when category explicitly disables jailbreak", func() {
			cfg := routerConfigWith(true, config.BoolPtr(false))

			Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse())
		})

		It("should return true when category explicitly enables jailbreak", func() {
			cfg := routerConfigWith(true, config.BoolPtr(true))

			Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue())
		})
	})

	Context("when global jailbreak is disabled", func() {
		It("should return false for category without explicit setting", func() {
			cfg := routerConfigWith(false, nil)

			Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse())
		})

		It("should return true when category explicitly enables jailbreak", func() {
			cfg := routerConfigWith(false, config.BoolPtr(true))

			Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue())
		})

		It("should return false when category explicitly disables jailbreak", func() {
			cfg := routerConfigWith(false, config.BoolPtr(false))

			Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse())
		})
	})

	Context("when category does not exist", func() {
		It("should fall back to global setting", func() {
			// No categories are registered, so the lookup for "nonexistent"
			// cannot find an override and the global flag decides the result.
			cfg := &config.RouterConfig{
				PromptGuard: config.PromptGuardConfig{
					Enabled: true,
				},
				Categories: []config.Category{},
			}

			Expect(cfg.IsJailbreakEnabledForCategory("nonexistent")).To(BeTrue())
		})
	})
})
})
25 changes: 16 additions & 9 deletions src/semantic-router/pkg/extproc/request_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,12 +396,7 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
// Get content from messages
userContent, nonUserMessages := extractUserAndNonUserContent(openAIRequest)

// Perform security checks
if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages); shouldReturn {
return response, nil
}

// Classify the request early to determine category for cache settings
// Classify the request early to determine category for security checks and cache settings
var categoryName string
if r.Config != nil && r.Config.IsAutoModelName(originalModel) && (len(nonUserMessages) > 0 || userContent != "") {
// Determine text to use for classification
Expand All @@ -417,6 +412,11 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
}
}

// Perform security checks with category-specific settings
if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages, categoryName); shouldReturn {
return response, nil
}

// Handle caching with category-specific settings
if response, shouldReturn := r.handleCaching(ctx, categoryName); shouldReturn {
return response, nil
Expand All @@ -426,13 +426,20 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
return r.handleModelRouting(openAIRequest, originalModel, userContent, nonUserMessages, ctx)
}

// performSecurityChecks performs PII and jailbreak detection
func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) {
// performSecurityChecks performs PII and jailbreak detection with category-specific settings
func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string, categoryName string) (*ext_proc.ProcessingResponse, bool) {
// Perform PII classification on all message content
allContent := pii.ExtractAllContent(userContent, nonUserMessages)

// Check if jailbreak detection is enabled for this category
jailbreakEnabled := r.Classifier.IsJailbreakEnabled()
if categoryName != "" && r.Config != nil {
// Use category-specific setting if available
jailbreakEnabled = jailbreakEnabled && r.Config.IsJailbreakEnabledForCategory(categoryName)
}

// Perform jailbreak detection on all message content
if r.Classifier.IsJailbreakEnabled() {
if jailbreakEnabled {
// Start jailbreak detection span
spanCtx, span := observability.StartSpan(ctx.TraceContext, observability.SpanJailbreakDetection)
defer span.End()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ go 1.24.1

replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding

require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ go 1.24.1

replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding

require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ go 1.24.1

replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding

require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
Loading