Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ Detect PII in the prompt, avoiding sending PII to the LLM so as to protect the p

#### Prompt guard

Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving.
Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving. Can be configured globally or at the category level for fine-grained security control.

### Similarity Caching ⚡️

Expand Down
4 changes: 3 additions & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ tools:
fallback_to_empty: true

prompt_guard:
enabled: true
enabled: true # Global default - can be overridden per category with jailbreak_enabled
use_modernbert: true
model_id: "models/jailbreak_classifier_modernbert-base_model"
threshold: 0.7
Expand Down Expand Up @@ -62,6 +62,8 @@ classifier:
categories:
- name: business
system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations."
# jailbreak_enabled: true # Optional: Override global jailbreak detection per category
# jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category
model_scores:
- model: qwen3
score: 0.7
Expand Down
126 changes: 126 additions & 0 deletions config/examples/jailbreak_category_example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# Category-Level Jailbreak Detection Example
# This example demonstrates how to configure jailbreak detection at the category level
# Different categories can have different jailbreak detection settings and thresholds based on their risk profiles

# Global jailbreak detection configuration (can be overridden per category)
prompt_guard:
enabled: true # Global default - can be overridden per category
use_modernbert: true
model_id: "models/jailbreak_classifier_modernbert-base_model"
threshold: 0.7 # Global default threshold - can be overridden per category
use_cpu: true
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# Classifier configuration
classifier:
category_model:
model_id: "models/category_classifier_modernbert-base_model"
use_modernbert: true
threshold: 0.6
use_cpu: true
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"

# Categories with different jailbreak detection settings
categories:
# High-security category: Strict jailbreak detection with high threshold
- name: business
description: "Business queries, strategy, and professional advice"
jailbreak_enabled: true # Explicitly enable (inherits from global by default)
jailbreak_threshold: 0.9 # Higher threshold for stricter detection
system_prompt: "You are a professional business consultant. Provide practical, actionable business advice."
model_scores:
- model: qwen3
score: 0.7
use_reasoning: false

# Public-facing category: Enable with standard threshold
- name: customer_support
description: "Customer support and general inquiries"
jailbreak_enabled: true # Explicitly enable for customer-facing content
jailbreak_threshold: 0.8 # Slightly higher than global for public-facing
system_prompt: "You are a friendly customer support agent. Help users with their questions."
model_scores:
- model: qwen3
score: 0.8
use_reasoning: false

# Internal tool category: Relaxed threshold (trusted environment)
- name: code_generation
description: "Internal code generation and development tools"
jailbreak_enabled: true # Keep enabled but with relaxed threshold
jailbreak_threshold: 0.5 # Lower threshold to reduce false positives for code
system_prompt: "You are a code generation assistant for internal developers."
model_scores:
- model: qwen3
score: 0.9
use_reasoning: true

# Testing category: Disable jailbreak detection
- name: testing
description: "Testing and quality assurance queries"
jailbreak_enabled: false # Disable for testing purposes
system_prompt: "You are a QA assistant helping with test scenarios."
model_scores:
- model: qwen3
score: 0.6
use_reasoning: false

# Default category: Uses global setting (inherits prompt_guard.enabled and threshold)
- name: general
description: "General queries that don't fit into specific categories"
# jailbreak_enabled not specified - will inherit from global prompt_guard.enabled
# jailbreak_threshold not specified - will inherit from global prompt_guard.threshold (0.7)
system_prompt: "You are a helpful assistant."
model_scores:
- model: qwen3
score: 0.5
use_reasoning: false

# Model configuration
model_config:
"qwen3":
reasoning_family: "qwen3"
preferred_endpoints: ["endpoint1"]
pii_policy:
allow_by_default: true

# Reasoning family configurations
reasoning_families:
qwen3:
type: "chat_template_kwargs"
parameter: "thinking"

# Default model for fallback
default_model: qwen3

# vLLM endpoints configuration
vllm_endpoints:
- name: "endpoint1"
address: "127.0.0.1"
port: 8000
weight: 1

# Usage Notes:
# =============
# 1. Global Settings:
# - prompt_guard.enabled: Sets the default enabled/disabled for all categories
# - prompt_guard.threshold: Sets the default detection threshold (0.0-1.0) for all categories
# 2. Category Overrides:
# - jailbreak_enabled: Override global enabled/disabled setting per category
# - jailbreak_threshold: Override global threshold per category
# 3. Inheritance:
# - If jailbreak_enabled is not specified, inherits from prompt_guard.enabled
# - If jailbreak_threshold is not specified, inherits from prompt_guard.threshold
# 4. Threshold Tuning:
# - Higher threshold (0.8-0.95): Stricter detection, fewer false positives, may miss subtle attacks
# - Lower threshold (0.5-0.7): More sensitive detection, catches more attacks, higher false positive rate
# - Recommended: Start with 0.7 globally, adjust per category based on risk profile
# 5. Use Cases:
# - High-security categories (business, customer_support): Use higher thresholds (0.8-0.9)
# - Internal tools with code/technical content: Use lower thresholds (0.5-0.6) to reduce false positives
# - General categories: Use global default threshold
# 6. Security Best Practices:
# - Enable jailbreak detection by default (prompt_guard.enabled: true)
# - Only disable or use very low thresholds for specific categories where the risk is managed differently
# - Consider the consequences of threshold settings on a per-category basis
# - Monitor false positive and false negative rates to tune thresholds appropriately
28 changes: 28 additions & 0 deletions src/semantic-router/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,12 @@ type Category struct {
// SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0)
// If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold
SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"`
// JailbreakEnabled controls whether jailbreak detection is enabled for this category
// If nil, inherits from global PromptGuard.Enabled setting
JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"`
// JailbreakThreshold defines the confidence threshold for jailbreak detection (0.0-1.0)
// If nil, uses the global threshold from PromptGuard.Threshold
JailbreakThreshold *float32 `yaml:"jailbreak_threshold,omitempty"`
}

// GetModelReasoningFamily returns the reasoning family configuration for a given model name
Expand Down Expand Up @@ -815,3 +821,25 @@ func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName strin
// Fall back to global cache threshold or bert threshold
return c.GetCacheSimilarityThreshold()
}

// IsJailbreakEnabledForCategory reports whether jailbreak detection is active
// for the named category. A non-nil category-level JailbreakEnabled override
// takes precedence; otherwise the global PromptGuard.Enabled flag applies.
func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool {
	if cat := c.GetCategoryByName(categoryName); cat != nil {
		if override := cat.JailbreakEnabled; override != nil {
			return *override
		}
	}
	// No per-category override found: inherit the global prompt guard setting.
	return c.PromptGuard.Enabled
}

// GetJailbreakThresholdForCategory returns the jailbreak detection confidence
// threshold (0.0-1.0) to apply for the named category. A non-nil
// category-level JailbreakThreshold override takes precedence; otherwise the
// global PromptGuard.Threshold applies.
func (c *RouterConfig) GetJailbreakThresholdForCategory(categoryName string) float32 {
	if cat := c.GetCategoryByName(categoryName); cat != nil {
		if override := cat.JailbreakThreshold; override != nil {
			return *override
		}
	}
	// No per-category override found: inherit the global prompt guard threshold.
	return c.PromptGuard.Threshold
}
Loading
Loading