Add category-level jailbreak threshold configuration

Copilot · Xunzhuo · Copilot · commit ea50e6c9caba · 2025-10-22T07:48:58.000Z
- Add JailbreakThreshold field to Category struct
- Add GetJailbreakThresholdForCategory helper method
- Create CheckForJailbreakWithThreshold and AnalyzeContentForJailbreakWithThreshold methods
- Update performSecurityChecks to use category-specific threshold
- Add 5 comprehensive tests for threshold configuration
- Update example configs with threshold tuning examples
- Update documentation with threshold configuration and tuning guidelines
- Add threshold tuning guide with recommendations for different category types

Co-authored-by: Xunzhuo <48784001+Xunzhuo@users.noreply.github.com>
diff --git a/config/config.yaml b/config/config.yaml
@@ -63,6 +63,7 @@ categories:
   - name: business
     system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations."
     # jailbreak_enabled: true  # Optional: Override global jailbreak detection per category
+    # jailbreak_threshold: 0.8  # Optional: Override global jailbreak threshold per category
     model_scores:
       - model: qwen3
         score: 0.7
diff --git a/config/examples/jailbreak_category_example.yaml b/config/examples/jailbreak_category_example.yaml
@@ -1,13 +1,13 @@
 # Category-Level Jailbreak Detection Example
 # This example demonstrates how to configure jailbreak detection at the category level
-# Different categories can have different jailbreak detection settings based on their risk profiles
+# Different categories can have different jailbreak detection settings and thresholds based on their risk profiles
 
 # Global jailbreak detection configuration (can be overridden per category)
 prompt_guard:
   enabled: true  # Global default - can be overridden per category
   use_modernbert: true
   model_id: "models/jailbreak_classifier_modernbert-base_model"
-  threshold: 0.7
+  threshold: 0.7  # Global default threshold - can be overridden per category
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
 
@@ -22,30 +22,33 @@ classifier:
 
 # Categories with different jailbreak detection settings
 categories:
-  # High-security category: Enable jailbreak detection
+  # High-security category: High threshold, so only high-confidence detections are flagged
   - name: business
     description: "Business queries, strategy, and professional advice"
     jailbreak_enabled: true  # Explicitly enable (inherits from global by default)
+    jailbreak_threshold: 0.9  # Higher threshold: flags only at confidence >= 0.9 (fewer false positives, may miss subtle attacks)
     system_prompt: "You are a professional business consultant. Provide practical, actionable business advice."
     model_scores:
       - model: qwen3
         score: 0.7
         use_reasoning: false
 
-  # Public-facing category: Enable jailbreak detection
+  # Public-facing category: Enable with a threshold slightly above the global default
   - name: customer_support
     description: "Customer support and general inquiries"
     jailbreak_enabled: true  # Explicitly enable for customer-facing content
+    jailbreak_threshold: 0.8  # Slightly higher than global for public-facing
     system_prompt: "You are a friendly customer support agent. Help users with their questions."
     model_scores:
       - model: qwen3
         score: 0.8
         use_reasoning: false
 
-  # Internal tool category: Disable jailbreak detection (trusted environment)
+  # Internal tool category (trusted environment): Lower threshold, which makes detection MORE sensitive
   - name: code_generation
     description: "Internal code generation and development tools"
-    jailbreak_enabled: false  # Disable for internal developer tools
+    jailbreak_enabled: true  # Keep enabled, with a category-specific threshold
+    jailbreak_threshold: 0.5  # Lower threshold: flags at confidence >= 0.5, so expect more false positives on code; raise it to reduce them
     system_prompt: "You are a code generation assistant for internal developers."
     model_scores:
       - model: qwen3
@@ -62,10 +65,11 @@ categories:
         score: 0.6
         use_reasoning: false
 
-  # Default category: Uses global setting (inherits prompt_guard.enabled)
+  # Default category: Uses global setting (inherits prompt_guard.enabled and threshold)
   - name: general
     description: "General queries that don't fit into specific categories"
     # jailbreak_enabled not specified - will inherit from global prompt_guard.enabled
+    # jailbreak_threshold not specified - will inherit from global prompt_guard.threshold (0.7)
     system_prompt: "You are a helpful assistant."
     model_scores:
       - model: qwen3
@@ -98,14 +102,25 @@ vllm_endpoints:
 
 # Usage Notes:
 # =============
-# 1. Global Setting (prompt_guard.enabled): Sets the default for all categories
-# 2. Category Override (jailbreak_enabled): Override global setting per category
-# 3. Inheritance: If jailbreak_enabled is not specified, inherits from prompt_guard.enabled
-# 4. Use Cases:
-#    - Set jailbreak_enabled: true for high-security, public-facing categories
-#    - Set jailbreak_enabled: false for internal tools or trusted environments
-#    - Omit jailbreak_enabled to use the global default
-# 5. Security Best Practices:
+# 1. Global Settings:
+#    - prompt_guard.enabled: Sets the default enabled/disabled for all categories
+#    - prompt_guard.threshold: Sets the default detection threshold (0.0-1.0) for all categories
+# 2. Category Overrides:
+#    - jailbreak_enabled: Override global enabled/disabled setting per category
+#    - jailbreak_threshold: Override global threshold per category
+# 3. Inheritance:
+#    - If jailbreak_enabled is not specified, inherits from prompt_guard.enabled
+#    - If jailbreak_threshold is not specified, inherits from prompt_guard.threshold
+# 4. Threshold Tuning:
+#    - Higher threshold (0.8-0.95): Less sensitive detection (only high-confidence matches are flagged), fewer false positives, may miss subtle attacks
+#    - Lower threshold (0.5-0.7): More sensitive detection, catches more attacks, higher false positive rate
+#    - Recommended: Start with 0.7 globally, adjust per category based on risk profile
+# 5. Use Cases:
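+#    - High-security categories (business, customer_support): Higher thresholds (0.8-0.9) flag only high-confidence detections; lower them if missed attacks are the bigger risk
+#    - Internal tools with code/technical content: Lower thresholds (0.5-0.6) are more sensitive and increase false positives (see Threshold Tuning above); raise the threshold to reduce false positives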
+#    - General categories: Use global default threshold
+# 6. Security Best Practices:
 #    - Enable jailbreak detection by default (prompt_guard.enabled: true)
-#    - Only disable for specific categories where the risk is managed differently
-#    - Consider the consequences of disabling protection on a per-category basis
+#    - Only disable detection or use very high thresholds (which flag almost nothing) for specific categories where the risk is managed differently
+#    - Consider the consequences of threshold settings on a per-category basis
+#    - Monitor false positive and false negative rates to tune thresholds appropriately
diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go
@@ -373,6 +373,9 @@ type Category struct {
 	// JailbreakEnabled controls whether jailbreak detection is enabled for this category
 	// If nil, inherits from global PromptGuard.Enabled setting
 	JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"`
+	// JailbreakThreshold defines the confidence threshold for jailbreak detection (0.0-1.0)
+	// If nil, uses the global threshold from PromptGuard.Threshold
+	JailbreakThreshold *float32 `yaml:"jailbreak_threshold,omitempty"`
 }
 
 // GetModelReasoningFamily returns the reasoning family configuration for a given model name
@@ -829,3 +832,14 @@ func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool {
 	// Fall back to global setting
 	return c.PromptGuard.Enabled
 }
+
+// GetJailbreakThresholdForCategory returns the effective jailbreak detection threshold for a category
+// Priority: category-specific > global prompt_guard threshold
+func (c *RouterConfig) GetJailbreakThresholdForCategory(categoryName string) float32 {
+	category := c.GetCategoryByName(categoryName)
+	if category != nil && category.JailbreakThreshold != nil {
+		return *category.JailbreakThreshold
+	}
+	// Fall back to global threshold
+	return c.PromptGuard.Threshold
+}
diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go
@@ -2029,4 +2029,88 @@ categories:
 			})
 		})
 	})
+
+	Describe("GetJailbreakThresholdForCategory", func() {
+		Context("when global threshold is set", func() {
+			It("should return global threshold for category without explicit setting", func() {
+				category := config.Category{
+					Name:        "test",
+					ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.7,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.7)))
+			})
+
+			It("should return category-specific threshold when set", func() {
+				category := config.Category{
+					Name:               "test",
+					JailbreakThreshold: config.Float32Ptr(0.9),
+					ModelScores:        []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.7,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.9)))
+			})
+
+			It("should allow lower threshold override", func() {
+				category := config.Category{
+					Name:               "test",
+					JailbreakThreshold: config.Float32Ptr(0.5),
+					ModelScores:        []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.7,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.5)))
+			})
+
+			It("should allow higher threshold override", func() {
+				category := config.Category{
+					Name:               "test",
+					JailbreakThreshold: config.Float32Ptr(0.95),
+					ModelScores:        []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.7,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.95)))
+			})
+		})
+
+		Context("when category does not exist", func() {
+			It("should fall back to global threshold", func() {
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.8,
+					},
+					Categories: []config.Category{},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("nonexistent")).To(Equal(float32(0.8)))
+			})
+		})
+	})
 })
diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go
@@ -438,14 +438,20 @@ func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent st
 		jailbreakEnabled = jailbreakEnabled && r.Config.IsJailbreakEnabledForCategory(categoryName)
 	}
 
+	// Get category-specific threshold
+	jailbreakThreshold := r.Config.PromptGuard.Threshold
+	if categoryName != "" && r.Config != nil {
+		jailbreakThreshold = r.Config.GetJailbreakThresholdForCategory(categoryName)
+	}
+
 	// Perform jailbreak detection on all message content
 	if jailbreakEnabled {
 		// Start jailbreak detection span
 		spanCtx, span := observability.StartSpan(ctx.TraceContext, observability.SpanJailbreakDetection)
 		defer span.End()
 
 		startTime := time.Now()
-		hasJailbreak, jailbreakDetections, err := r.Classifier.AnalyzeContentForJailbreak(allContent)
+		hasJailbreak, jailbreakDetections, err := r.Classifier.AnalyzeContentForJailbreakWithThreshold(allContent, jailbreakThreshold)
 		detectionTime := time.Since(startTime).Milliseconds()
 
 		observability.SetSpanAttributes(span,
diff --git a/src/semantic-router/pkg/utils/classification/classifier.go b/src/semantic-router/pkg/utils/classification/classifier.go
@@ -425,6 +425,11 @@ func (c *Classifier) initializeJailbreakClassifier() error {
 
 // CheckForJailbreak analyzes the given text for jailbreak attempts
 func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, error) {
+	return c.CheckForJailbreakWithThreshold(text, c.Config.PromptGuard.Threshold)
+}
+
+// CheckForJailbreakWithThreshold analyzes the given text for jailbreak attempts with a custom threshold
+func (c *Classifier) CheckForJailbreakWithThreshold(text string, threshold float32) (bool, string, float32, error) {
 	if !c.IsJailbreakEnabled() {
 		return false, "", 0.0, fmt.Errorf("jailbreak detection is not enabled or properly configured")
 	}
@@ -453,21 +458,26 @@ func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, erro
 	}
 
 	// Check if confidence meets threshold and indicates jailbreak
-	isJailbreak := result.Confidence >= c.Config.PromptGuard.Threshold && jailbreakType == "jailbreak"
+	isJailbreak := result.Confidence >= threshold && jailbreakType == "jailbreak"
 
 	if isJailbreak {
 		observability.Warnf("JAILBREAK DETECTED: '%s' (confidence: %.3f, threshold: %.3f)",
-			jailbreakType, result.Confidence, c.Config.PromptGuard.Threshold)
+			jailbreakType, result.Confidence, threshold)
 	} else {
 		observability.Infof("BENIGN: '%s' (confidence: %.3f, threshold: %.3f)",
-			jailbreakType, result.Confidence, c.Config.PromptGuard.Threshold)
+			jailbreakType, result.Confidence, threshold)
 	}
 
 	return isJailbreak, jailbreakType, result.Confidence, nil
 }
 
 // AnalyzeContentForJailbreak analyzes multiple content pieces for jailbreak attempts
 func (c *Classifier) AnalyzeContentForJailbreak(contentList []string) (bool, []JailbreakDetection, error) {
+	return c.AnalyzeContentForJailbreakWithThreshold(contentList, c.Config.PromptGuard.Threshold)
+}
+
+// AnalyzeContentForJailbreakWithThreshold analyzes multiple content pieces for jailbreak attempts with a custom threshold
+func (c *Classifier) AnalyzeContentForJailbreakWithThreshold(contentList []string, threshold float32) (bool, []JailbreakDetection, error) {
 	if !c.IsJailbreakEnabled() {
 		return false, nil, fmt.Errorf("jailbreak detection is not enabled or properly configured")
 	}
@@ -480,7 +490,7 @@ func (c *Classifier) AnalyzeContentForJailbreak(contentList []string) (bool, []J
 			continue
 		}
 
-		isJailbreak, jailbreakType, confidence, err := c.CheckForJailbreak(content)
+		isJailbreak, jailbreakType, confidence, err := c.CheckForJailbreakWithThreshold(content, threshold)
 		if err != nil {
 			observability.Errorf("Error analyzing content %d: %v", i, err)
 			continue
diff --git a/website/docs/overview/categories/configuration.md b/website/docs/overview/categories/configuration.md
@@ -111,6 +111,42 @@ categories:
         score: 0.5
 ```
 
+#### `jailbreak_threshold` (Optional)
+
+- **Type**: Float (0.0-1.0)
+- **Description**: Confidence threshold for jailbreak detection
+- **Default**: Inherits from global `prompt_guard.threshold` setting
+- **Impact**: Controls sensitivity of jailbreak detection for this category
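+- **Tuning**: A prompt is flagged when the classifier confidence is at or above the threshold. Higher values = less sensitive (fewer false positives, may miss attacks); lower values = more sensitive (catches more attacks, more false positives)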
+
+```yaml
+categories:
+  - name: customer_support
+    jailbreak_enabled: true
+    jailbreak_threshold: 0.9  # Flags only high-confidence detections (fewer false positives)
+    model_scores:
+      - model: qwen3
+        score: 0.8
+
+  - name: code_generation
+    jailbreak_enabled: true
+    jailbreak_threshold: 0.5  # More sensitive: flags at confidence >= 0.5 (more false positives on code)
+    model_scores:
+      - model: qwen3
+        score: 0.9
+
+  - name: general
+    # No jailbreak_threshold - inherits from global prompt_guard.threshold
+    model_scores:
+      - model: qwen3
+        score: 0.5
+```
+
+**Threshold Guidelines**:
+- **0.8-0.95**: High-security categories (customer support, business)
+- **0.6-0.8**: Standard categories (general queries)
+- **0.4-0.6**: Technical categories (code generation, development tools)
+
 #### `use_reasoning` (Required)
 
 - **Type**: Boolean
@@ -228,21 +264,23 @@ categories:
 
 ```yaml
 categories:
-  # High-security public-facing category
+  # High-security public-facing category with a high threshold
   - name: "customer_support"
     description: "Customer support and general inquiries"
     jailbreak_enabled: true  # Strict jailbreak protection
+    jailbreak_threshold: 0.9  # High threshold for public-facing
     use_reasoning: false
     model_scores:
       - model: "phi4"
         score: 0.9
       - model: "mistral-small3.1"
         score: 0.7
 
-  # Trusted internal development category
+  # Technical category with a lower (more sensitive) threshold
   - name: "code_generation"
-    description: "Internal code generation for developers"
-    jailbreak_enabled: false  # Allow broader input for trusted users
+    description: "Code generation for developers"
+    jailbreak_enabled: true  # Keep enabled
+    jailbreak_threshold: 0.5  # Lower threshold: more sensitive, flags at confidence >= 0.5 (more false positives on code)
     use_reasoning: true
     reasoning_effort: "medium"
     model_scores:
diff --git a/website/docs/tutorials/content-safety/jailbreak-protection.md b/website/docs/tutorials/content-safety/jailbreak-protection.md