Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ Detect PII in the prompt, avoiding sending PII to the LLM so as to protect the p

#### Prompt guard

Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving.
Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving. Can be configured globally or at the category level for fine-grained security control.

### Similarity Caching ⚡️

Expand Down
4 changes: 3 additions & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ tools:
fallback_to_empty: true

prompt_guard:
enabled: true
enabled: true # Global default - can be overridden per category with jailbreak_enabled
use_modernbert: true
model_id: "models/jailbreak_classifier_modernbert-base_model"
threshold: 0.7
Expand Down Expand Up @@ -62,6 +62,8 @@ classifier:
categories:
- name: business
system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations."
# jailbreak_enabled: true # Optional: Override global jailbreak detection per category
# jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category
model_scores:
- model: qwen3
score: 0.7
Expand Down
126 changes: 126 additions & 0 deletions config/examples/jailbreak_category_example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# Category-Level Jailbreak Detection Example
# This example demonstrates how to configure jailbreak detection at the category level
# Different categories can have different jailbreak detection settings and thresholds based on their risk profiles

# Global jailbreak detection configuration (can be overridden per category)
prompt_guard:
enabled: true # Global default - can be overridden per category
use_modernbert: true
model_id: "models/jailbreak_classifier_modernbert-base_model"
threshold: 0.7 # Global default threshold - can be overridden per category
use_cpu: true
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# Classifier configuration
classifier:
category_model:
model_id: "models/category_classifier_modernbert-base_model"
use_modernbert: true
threshold: 0.6
use_cpu: true
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"

# Categories with different jailbreak detection settings
categories:
# High-security category: Strict jailbreak detection with high threshold
- name: business
description: "Business queries, strategy, and professional advice"
jailbreak_enabled: true # Explicitly enable (inherits from global by default)
jailbreak_threshold: 0.9 # Higher threshold for stricter detection
system_prompt: "You are a professional business consultant. Provide practical, actionable business advice."
model_scores:
- model: qwen3
score: 0.7
use_reasoning: false

# Public-facing category: Enable with standard threshold
- name: customer_support
description: "Customer support and general inquiries"
jailbreak_enabled: true # Explicitly enable for customer-facing content
jailbreak_threshold: 0.8 # Slightly higher than global for public-facing
system_prompt: "You are a friendly customer support agent. Help users with their questions."
model_scores:
- model: qwen3
score: 0.8
use_reasoning: false

# Internal tool category: Relaxed threshold (trusted environment)
- name: code_generation
description: "Internal code generation and development tools"
jailbreak_enabled: true # Keep enabled but with relaxed threshold
jailbreak_threshold: 0.5 # Lower threshold to reduce false positives for code
system_prompt: "You are a code generation assistant for internal developers."
model_scores:
- model: qwen3
score: 0.9
use_reasoning: true

# Testing category: Disable jailbreak detection
- name: testing
description: "Testing and quality assurance queries"
jailbreak_enabled: false # Disable for testing purposes
system_prompt: "You are a QA assistant helping with test scenarios."
model_scores:
- model: qwen3
score: 0.6
use_reasoning: false

# Default category: Uses global setting (inherits prompt_guard.enabled and threshold)
- name: general
description: "General queries that don't fit into specific categories"
# jailbreak_enabled not specified - will inherit from global prompt_guard.enabled
# jailbreak_threshold not specified - will inherit from global prompt_guard.threshold (0.7)
system_prompt: "You are a helpful assistant."
model_scores:
- model: qwen3
score: 0.5
use_reasoning: false

# Model configuration
model_config:
"qwen3":
reasoning_family: "qwen3"
preferred_endpoints: ["endpoint1"]
pii_policy:
allow_by_default: true

# Reasoning family configurations
reasoning_families:
qwen3:
type: "chat_template_kwargs"
parameter: "thinking"

# Default model for fallback
default_model: qwen3

# vLLM endpoints configuration
vllm_endpoints:
- name: "endpoint1"
address: "127.0.0.1"
port: 8000
weight: 1

# Usage Notes:
# =============
# 1. Global Settings:
# - prompt_guard.enabled: Sets the default enabled/disabled for all categories
# - prompt_guard.threshold: Sets the default detection threshold (0.0-1.0) for all categories
# 2. Category Overrides:
# - jailbreak_enabled: Override global enabled/disabled setting per category
# - jailbreak_threshold: Override global threshold per category
# 3. Inheritance:
# - If jailbreak_enabled is not specified, inherits from prompt_guard.enabled
# - If jailbreak_threshold is not specified, inherits from prompt_guard.threshold
# 4. Threshold Tuning:
# - Higher threshold (0.8-0.95): Stricter detection, fewer false positives, may miss subtle attacks
# - Lower threshold (0.5-0.7): More sensitive detection, catches more attacks, higher false positive rate
# - Recommended: Start with 0.7 globally, adjust per category based on risk profile
# 5. Use Cases:
# - High-security categories (business, customer_support): Use higher thresholds (0.8-0.9)
# - Internal tools with code/technical content: Use lower thresholds (0.5-0.6) to reduce false positives
# - General categories: Use global default threshold
# 6. Security Best Practices:
# - Enable jailbreak detection by default (prompt_guard.enabled: true)
# - Only disable or use very low thresholds for specific categories where the risk is managed differently
# - Consider the consequences of threshold settings on a per-category basis
# - Monitor false positive and false negative rates to tune thresholds appropriately
28 changes: 28 additions & 0 deletions src/semantic-router/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,12 @@ type Category struct {
// SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0)
// If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold
SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"`
// JailbreakEnabled controls whether jailbreak detection is enabled for this category
// If nil, inherits from global PromptGuard.Enabled setting
JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"`
// JailbreakThreshold defines the confidence threshold for jailbreak detection (0.0-1.0)
// If nil, uses the global threshold from PromptGuard.Threshold
JailbreakThreshold *float32 `yaml:"jailbreak_threshold,omitempty"`
}

// GetModelReasoningFamily returns the reasoning family configuration for a given model name
Expand Down Expand Up @@ -815,3 +821,25 @@ func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName strin
// Fall back to global cache threshold or bert threshold
return c.GetCacheSimilarityThreshold()
}

// IsJailbreakEnabledForCategory reports whether jailbreak detection is active
// for the named category. A non-nil category-level JailbreakEnabled override
// takes precedence; otherwise the global PromptGuard.Enabled flag applies.
func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool {
	if cat := c.GetCategoryByName(categoryName); cat != nil {
		if override := cat.JailbreakEnabled; override != nil {
			return *override
		}
	}
	// No per-category override found: inherit the global prompt guard setting.
	return c.PromptGuard.Enabled
}

// GetJailbreakThresholdForCategory returns the jailbreak detection confidence
// threshold (0.0-1.0) to apply for the named category. A non-nil
// category-level JailbreakThreshold override takes precedence; otherwise the
// global PromptGuard.Threshold applies.
func (c *RouterConfig) GetJailbreakThresholdForCategory(categoryName string) float32 {
	if cat := c.GetCategoryByName(categoryName); cat != nil {
		if override := cat.JailbreakThreshold; override != nil {
			return *override
		}
	}
	// No per-category override found: inherit the global prompt guard threshold.
	return c.PromptGuard.Threshold
}
Loading
Loading