Skip to content

Commit bc4cc98

Browse files
Copilot and Xunzhuo authored
Allow jailbreak detection and threshold to be configured at the category level (#508)
* Initial plan

* Add category-level jailbreak detection configuration

Co-authored-by: Xunzhuo <[email protected]>

* Add documentation for category-level jailbreak settings

Co-authored-by: Xunzhuo <[email protected]>

* Update documentation for category-level jailbreak detection

- Add category-level jailbreak configuration to jailbreak-protection.md
- Update category configuration docs with jailbreak_enabled parameter
- Add security-focused configuration example
- Update global configuration docs with category override notes
- Update README to mention fine-grained security control

Co-authored-by: Xunzhuo <[email protected]>

* Add category-level jailbreak threshold configuration

- Add JailbreakThreshold field to Category struct
- Add GetJailbreakThresholdForCategory helper method
- Create CheckForJailbreakWithThreshold and AnalyzeContentForJailbreakWithThreshold methods
- Update performSecurityChecks to use category-specific threshold
- Add 5 comprehensive tests for threshold configuration
- Update example configs with threshold tuning examples
- Update documentation with threshold configuration and tuning guidelines
- Add threshold tuning guide with recommendations for different category types

Co-authored-by: Xunzhuo <[email protected]>

---------

Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: Xunzhuo <[email protected]>
1 parent 7b78326 commit bc4cc98

File tree

13 files changed

+600
-23
lines changed

13 files changed

+600
-23
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ Detect PII in the prompt, avoiding sending PII to the LLM so as to protect the p
7676

7777
#### Prompt guard
7878

79-
Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving.
79+
Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving. Can be configured globally or at the category level for fine-grained security control.
8080

8181
### Similarity Caching ⚡️
8282

config/config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ tools:
1919
fallback_to_empty: true
2020

2121
prompt_guard:
22-
enabled: true
22+
enabled: true # Global default - can be overridden per category with jailbreak_enabled
2323
use_modernbert: true
2424
model_id: "models/jailbreak_classifier_modernbert-base_model"
2525
threshold: 0.7
@@ -62,6 +62,8 @@ classifier:
6262
categories:
6363
- name: business
6464
system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations."
65+
# jailbreak_enabled: true # Optional: Override global jailbreak detection per category
66+
# jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category
6567
model_scores:
6668
- model: qwen3
6769
score: 0.7
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Category-Level Jailbreak Detection Example
2+
# This example demonstrates how to configure jailbreak detection at the category level
3+
# Different categories can have different jailbreak detection settings and thresholds based on their risk profiles
4+
5+
# Global jailbreak detection configuration (can be overridden per category)
6+
prompt_guard:
7+
enabled: true # Global default - can be overridden per category
8+
use_modernbert: true
9+
model_id: "models/jailbreak_classifier_modernbert-base_model"
10+
threshold: 0.7 # Global default threshold - can be overridden per category
11+
use_cpu: true
12+
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
13+
14+
# Classifier configuration
15+
classifier:
16+
category_model:
17+
model_id: "models/category_classifier_modernbert-base_model"
18+
use_modernbert: true
19+
threshold: 0.6
20+
use_cpu: true
21+
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
22+
23+
# Categories with different jailbreak detection settings
24+
categories:
25+
# High-security category: Strict jailbreak detection with high threshold
26+
- name: business
27+
description: "Business queries, strategy, and professional advice"
28+
jailbreak_enabled: true # Explicitly enable (inherits from global by default)
29+
jailbreak_threshold: 0.9 # Higher threshold: flag only high-confidence jailbreaks (fewer false positives, but may miss subtle attacks)
30+
system_prompt: "You are a professional business consultant. Provide practical, actionable business advice."
31+
model_scores:
32+
- model: qwen3
33+
score: 0.7
34+
use_reasoning: false
35+
36+
# Public-facing category: Enable with standard threshold
37+
- name: customer_support
38+
description: "Customer support and general inquiries"
39+
jailbreak_enabled: true # Explicitly enable for customer-facing content
40+
jailbreak_threshold: 0.8 # Slightly higher than global for public-facing
41+
system_prompt: "You are a friendly customer support agent. Help users with their questions."
42+
model_scores:
43+
- model: qwen3
44+
score: 0.8
45+
use_reasoning: false
46+
47+
# Internal tool category: Relaxed threshold (trusted environment)
48+
- name: code_generation
49+
description: "Internal code generation and development tools"
50+
jailbreak_enabled: true # Keep enabled but with relaxed threshold
51+
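jailbreak_threshold: 0.5 # NOTE: a lower threshold makes detection MORE sensitive (more prompts flagged, higher false-positive rate); use a value above the global 0.7 if the goal is fewer false positives for code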
52+
system_prompt: "You are a code generation assistant for internal developers."
53+
model_scores:
54+
- model: qwen3
55+
score: 0.9
56+
use_reasoning: true
57+
58+
# Testing category: Disable jailbreak detection
59+
- name: testing
60+
description: "Testing and quality assurance queries"
61+
jailbreak_enabled: false # Disable for testing purposes
62+
system_prompt: "You are a QA assistant helping with test scenarios."
63+
model_scores:
64+
- model: qwen3
65+
score: 0.6
66+
use_reasoning: false
67+
68+
# Default category: Uses global setting (inherits prompt_guard.enabled and threshold)
69+
- name: general
70+
description: "General queries that don't fit into specific categories"
71+
# jailbreak_enabled not specified - will inherit from global prompt_guard.enabled
72+
# jailbreak_threshold not specified - will inherit from global prompt_guard.threshold (0.7)
73+
system_prompt: "You are a helpful assistant."
74+
model_scores:
75+
- model: qwen3
76+
score: 0.5
77+
use_reasoning: false
78+
79+
# Model configuration
80+
model_config:
81+
"qwen3":
82+
reasoning_family: "qwen3"
83+
preferred_endpoints: ["endpoint1"]
84+
pii_policy:
85+
allow_by_default: true
86+
87+
# Reasoning family configurations
88+
reasoning_families:
89+
qwen3:
90+
type: "chat_template_kwargs"
91+
parameter: "thinking"
92+
93+
# Default model for fallback
94+
default_model: qwen3
95+
96+
# vLLM endpoints configuration
97+
vllm_endpoints:
98+
- name: "endpoint1"
99+
address: "127.0.0.1"
100+
port: 8000
101+
weight: 1
102+
103+
# Usage Notes:
104+
# =============
105+
# 1. Global Settings:
106+
# - prompt_guard.enabled: Sets the default enabled/disabled for all categories
107+
# - prompt_guard.threshold: Sets the default detection threshold (0.0-1.0) for all categories
108+
# 2. Category Overrides:
109+
# - jailbreak_enabled: Override global enabled/disabled setting per category
110+
# - jailbreak_threshold: Override global threshold per category
111+
# 3. Inheritance:
112+
# - If jailbreak_enabled is not specified, inherits from prompt_guard.enabled
113+
# - If jailbreak_threshold is not specified, inherits from prompt_guard.threshold
114+
# 4. Threshold Tuning:
115+
# - Higher threshold (0.8-0.95): Flags only high-confidence detections; fewer false positives, but may miss subtle attacks
116+
# - Lower threshold (0.5-0.7): More sensitive detection, catches more attacks, higher false positive rate
117+
# - Recommended: Start with 0.7 globally, adjust per category based on risk profile
118+
# 5. Use Cases:
119+
# - High-security categories (business, customer_support): Use higher thresholds (0.8-0.9)
120+
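# - Internal tools with code/technical content: Thresholds below the global default (0.5-0.6) make detection more sensitive and raise false positives; choose a higher threshold if the goal is to reduce false positives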
121+
# - General categories: Use global default threshold
122+
# 6. Security Best Practices:
123+
# - Enable jailbreak detection by default (prompt_guard.enabled: true)
124+
# - Only disable or use very low thresholds for specific categories where the risk is managed differently
125+
# - Consider the consequences of threshold settings on a per-category basis
126+
# - Monitor false positive and false negative rates to tune thresholds appropriately

src/semantic-router/pkg/config/config.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,12 @@ type Category struct {
370370
// SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0)
371371
// If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold
372372
SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"`
373+
// JailbreakEnabled controls whether jailbreak detection is enabled for this category
374+
// If nil, inherits from global PromptGuard.Enabled setting
375+
JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"`
376+
// JailbreakThreshold defines the confidence threshold for jailbreak detection (0.0-1.0)
377+
// If nil, uses the global threshold from PromptGuard.Threshold
378+
JailbreakThreshold *float32 `yaml:"jailbreak_threshold,omitempty"`
373379
}
374380

375381
// GetModelReasoningFamily returns the reasoning family configuration for a given model name
@@ -815,3 +821,25 @@ func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName strin
815821
// Fall back to global cache threshold or bert threshold
816822
return c.GetCacheSimilarityThreshold()
817823
}
824+
825+
// IsJailbreakEnabledForCategory returns whether jailbreak detection is enabled for a specific category
826+
// If the category has an explicit setting, it takes precedence; otherwise, the global setting is used.
827+
func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool {
828+
category := c.GetCategoryByName(categoryName)
829+
if category != nil && category.JailbreakEnabled != nil {
830+
return *category.JailbreakEnabled
831+
}
832+
// Fall back to global setting
833+
return c.PromptGuard.Enabled
834+
}
835+
836+
// GetJailbreakThresholdForCategory returns the effective jailbreak detection threshold for a category
837+
// Priority: category-specific > global prompt_guard threshold
838+
func (c *RouterConfig) GetJailbreakThresholdForCategory(categoryName string) float32 {
839+
category := c.GetCategoryByName(categoryName)
840+
if category != nil && category.JailbreakThreshold != nil {
841+
return *category.JailbreakThreshold
842+
}
843+
// Fall back to global threshold
844+
return c.PromptGuard.Threshold
845+
}

0 commit comments

Comments
 (0)