diff --git a/README.md b/README.md
index c55a936a..f856ed1b 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@ Detect PII in the prompt, avoiding sending PII to the LLM so as to protect the p
 
 #### Prompt guard
 
-Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving.
+Detect jailbreak prompts and stop them before they are sent to the LLM, preventing the model from misbehaving. Can be configured globally or at the category level for fine-grained security control.
 
 ### Similarity Caching ⚡️
 
diff --git a/config/config.yaml b/config/config.yaml
index 279feb67..06c1b60f 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -19,7 +19,7 @@ tools:
   fallback_to_empty: true
 
 prompt_guard:
-  enabled: true
+  enabled: true # Global default - can be overridden per category with jailbreak_enabled
   use_modernbert: true
   model_id: "models/jailbreak_classifier_modernbert-base_model"
   threshold: 0.7
@@ -62,6 +62,8 @@ classifier:
 categories:
   - name: business
     system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations."
+    # jailbreak_enabled: true # Optional: Override global jailbreak detection per category
+    # jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category
     model_scores:
       - model: qwen3
         score: 0.7
diff --git a/config/examples/jailbreak_category_example.yaml b/config/examples/jailbreak_category_example.yaml
new file mode 100644
index 00000000..52b84087
--- /dev/null
+++ b/config/examples/jailbreak_category_example.yaml
@@ -0,0 +1,126 @@
+# Category-Level Jailbreak Detection Example
+# This example demonstrates how to configure jailbreak detection at the category level
+# Different categories can have different jailbreak detection settings and thresholds based on their risk profiles
+
+# Global jailbreak detection configuration (can be overridden per category)
+prompt_guard:
+  enabled: true # Global default - can be overridden per category
+  use_modernbert: true
+  model_id: "models/jailbreak_classifier_modernbert-base_model"
+  threshold: 0.7 # Global default threshold - can be overridden per category
+  use_cpu: true
+  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
+
+# Classifier configuration
+classifier:
+  category_model:
+    model_id: "models/category_classifier_modernbert-base_model"
+    use_modernbert: true
+    threshold: 0.6
+    use_cpu: true
+    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
+
+# Categories with different jailbreak detection settings
+categories:
+  # High-stakes category: high threshold, so only high-confidence jailbreaks are blocked
+  - name: business
+    description: "Business queries, strategy, and professional advice"
+    jailbreak_enabled: true # Explicitly enable (inherits from global by default)
+    jailbreak_threshold: 0.9 # Higher threshold - blocks only high-confidence detections
+    system_prompt: "You are a professional business consultant. Provide practical, actionable business advice."
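+    # Note: with threshold 0.9, a prompt must be classified as "jailbreak" with
+    # confidence >= 0.9 to be blocked in this category; the same prompt at 0.8
+    # confidence would still be blocked by the global 0.7 default elsewhere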
+    model_scores:
+      - model: qwen3
+        score: 0.7
+        use_reasoning: false
+
+  # Public-facing category: Enable with standard threshold
+  - name: customer_support
+    description: "Customer support and general inquiries"
+    jailbreak_enabled: true # Explicitly enable for customer-facing content
+    jailbreak_threshold: 0.8 # Slightly higher than global for public-facing
+    system_prompt: "You are a friendly customer support agent. Help users with their questions."
+    model_scores:
+      - model: qwen3
+        score: 0.8
+        use_reasoning: false
+
+  # Internal tool category: Sensitive screening (false positives are cheap internally)
+  - name: code_generation
+    description: "Internal code generation and development tools"
+    jailbreak_enabled: true # Keep enabled, with a lower, more sensitive threshold
+    jailbreak_threshold: 0.5 # Lower threshold - flags more borderline prompts
+    system_prompt: "You are a code generation assistant for internal developers."
+    model_scores:
+      - model: qwen3
+        score: 0.9
+        use_reasoning: true
+
+  # Testing category: Disable jailbreak detection
+  - name: testing
+    description: "Testing and quality assurance queries"
+    jailbreak_enabled: false # Disable for testing purposes
+    system_prompt: "You are a QA assistant helping with test scenarios."
+    model_scores:
+      - model: qwen3
+        score: 0.6
+        use_reasoning: false
+
+  # Default category: Uses global setting (inherits prompt_guard.enabled and threshold)
+  - name: general
+    description: "General queries that don't fit into specific categories"
+    # jailbreak_enabled not specified - will inherit from global prompt_guard.enabled
+    # jailbreak_threshold not specified - will inherit from global prompt_guard.threshold (0.7)
+    system_prompt: "You are a helpful assistant."
+    model_scores:
+      - model: qwen3
+        score: 0.5
+        use_reasoning: false
+
+# Model configuration
+model_config:
+  "qwen3":
+    reasoning_family: "qwen3"
+    preferred_endpoints: ["endpoint1"]
+    pii_policy:
+      allow_by_default: true
+
+# Reasoning family configurations
+reasoning_families:
+  qwen3:
+    type: "chat_template_kwargs"
+    parameter: "thinking"
+
+# Default model for fallback
+default_model: qwen3
+
+# vLLM endpoints configuration
+vllm_endpoints:
+  - name: "endpoint1"
+    address: "127.0.0.1"
+    port: 8000
+    weight: 1
+
+# Usage Notes:
+# =============
+# 1. Global Settings:
+#    - prompt_guard.enabled: Sets the default enabled/disabled for all categories
+#    - prompt_guard.threshold: Sets the default detection threshold (0.0-1.0) for all categories
+# 2. Category Overrides:
+#    - jailbreak_enabled: Override global enabled/disabled setting per category
+#    - jailbreak_threshold: Override global threshold per category
+# 3. Inheritance:
+#    - If jailbreak_enabled is not specified, inherits from prompt_guard.enabled
+#    - If jailbreak_threshold is not specified, inherits from prompt_guard.threshold
+# 4. Threshold Tuning:
+#    - Higher threshold (0.8-0.95): Requires high confidence to block; fewer false positives, may miss subtle attacks
+#    - Lower threshold (0.5-0.7): More sensitive detection, catches more attacks, higher false positive rate
+#    - Recommended: Start with 0.7 globally, adjust per category based on risk profile
+# 5. Use Cases:
+#    - Public-facing categories (business, customer_support): Higher thresholds (0.8-0.9) avoid falsely blocking legitimate users
+#    - Internal tools with code/technical content: Lower thresholds (0.5-0.6) screen aggressively; suitable where false positives are cheap
+#    - General categories: Use global default threshold
+# 6. Security Best Practices:
+#    - Enable jailbreak detection by default (prompt_guard.enabled: true)
+#    - Only disable detection, or move a category's threshold far from the global default, where the trade-off is well understood
+#    - Consider the consequences of threshold settings on a per-category basis
+#    - Monitor false positive and false negative rates to tune thresholds appropriately
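+#
+# Worked example (hypothetical prompt scored as "jailbreak" with confidence 0.75):
+#    - business (threshold 0.9):           0.75 < 0.9  -> allowed through to the model
+#    - customer_support (threshold 0.8):   0.75 < 0.8  -> allowed through
+#    - code_generation (threshold 0.5):    0.75 >= 0.5 -> blocked
+#    - general (inherits 0.7):             0.75 >= 0.7 -> blocked
+#    - testing (jailbreak_enabled: false): not evaluated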
diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go
index 9766d473..8e5d34aa 100644
--- a/src/semantic-router/pkg/config/config.go
+++ b/src/semantic-router/pkg/config/config.go
@@ -370,6 +370,12 @@ type Category struct {
 	// SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0)
 	// If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold
 	SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"`
+	// JailbreakEnabled controls whether jailbreak detection is enabled for this category
+	// If nil, inherits from global PromptGuard.Enabled setting
+	JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"`
+	// JailbreakThreshold defines the confidence threshold for jailbreak detection (0.0-1.0)
+	// If nil, uses the global threshold from PromptGuard.Threshold
+	JailbreakThreshold *float32 `yaml:"jailbreak_threshold,omitempty"`
 }
 
 // GetModelReasoningFamily returns the reasoning family configuration for a given model name
@@ -815,3 +821,25 @@ func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName strin
 	// Fall back to global cache threshold or bert threshold
 	return c.GetCacheSimilarityThreshold()
 }
+
+// IsJailbreakEnabledForCategory returns whether jailbreak detection is enabled for a specific category
+// If the category has an explicit setting, it takes precedence; otherwise, uses global setting
+func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool {
+	category := c.GetCategoryByName(categoryName)
+	if category != nil && category.JailbreakEnabled != nil {
+		return *category.JailbreakEnabled
+	}
+	// Fall back to global setting
+	return c.PromptGuard.Enabled
+}
+
+// GetJailbreakThresholdForCategory returns the effective jailbreak detection threshold for a category
+// Priority: category-specific > global prompt_guard threshold
+func (c *RouterConfig) GetJailbreakThresholdForCategory(categoryName string) float32 {
+	category := c.GetCategoryByName(categoryName)
+	if category != nil && category.JailbreakThreshold != nil {
+		return *category.JailbreakThreshold
+	}
+	// Fall back to global threshold
+	return c.PromptGuard.Threshold
+}
diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go
index 8a34f399..ff027be3 100644
--- a/src/semantic-router/pkg/config/config_test.go
+++ b/src/semantic-router/pkg/config/config_test.go
@@ -1910,4 +1910,207 @@ categories:
 			})
 		})
 	})
+
+	Describe("IsJailbreakEnabledForCategory", func() {
+		Context("when global jailbreak is enabled", func() {
+			It("should return true for category without explicit setting", func() {
+				category := config.Category{
+					Name:        "test",
+					ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Enabled: true,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue())
+			})
+
+			It("should return false when category explicitly disables jailbreak", func() {
+				category := config.Category{
+					Name:             "test",
+					JailbreakEnabled: config.BoolPtr(false),
+					ModelScores:      []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Enabled: true,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse())
+			})
+
+			It("should return true when category explicitly enables jailbreak", func() {
+				category := config.Category{
+					Name:             "test",
+					JailbreakEnabled: config.BoolPtr(true),
+					ModelScores:      []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Enabled: true,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue())
+			})
+		})
+
+		Context("when global jailbreak is disabled", func() {
+			It("should return false for category without explicit setting", func() {
+				category := config.Category{
+					Name:        "test",
+					ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Enabled: false,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse())
+			})
+
+			It("should return true when category explicitly enables jailbreak", func() {
+				category := config.Category{
+					Name:             "test",
+					JailbreakEnabled: config.BoolPtr(true),
+					ModelScores:      []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Enabled: false,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue())
+			})
+
+			It("should return false when category explicitly disables jailbreak", func() {
+				category := config.Category{
+					Name:             "test",
+					JailbreakEnabled: config.BoolPtr(false),
+					ModelScores:      []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Enabled: false,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse())
+			})
+		})
+
+		Context("when category does not exist", func() {
+			It("should fall back to global setting", func() {
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Enabled: true,
+					},
+					Categories: []config.Category{},
+				}
+
+				Expect(cfg.IsJailbreakEnabledForCategory("nonexistent")).To(BeTrue())
+			})
+		})
+	})
+
+	Describe("GetJailbreakThresholdForCategory", func() {
+		Context("when global threshold is set", func() {
+			It("should return global threshold for category without explicit setting", func() {
+				category := config.Category{
+					Name:        "test",
+					ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.7,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.7)))
+			})
+
+			It("should return category-specific threshold when set", func() {
+				category := config.Category{
+					Name:               "test",
+					JailbreakThreshold: config.Float32Ptr(0.9),
+					ModelScores:        []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.7,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.9)))
+			})
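+
+			// Additional coverage (sketch, using the same public API as the cases
+			// above; the category names are illustrative): sibling categories
+			// resolve their thresholds independently of one another
+			It("should resolve thresholds independently for sibling categories", func() {
+				strict := config.Category{
+					Name:               "strict",
+					JailbreakThreshold: config.Float32Ptr(0.9),
+					ModelScores:        []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+				relaxed := config.Category{
+					Name:               "relaxed",
+					JailbreakThreshold: config.Float32Ptr(0.5),
+					ModelScores:        []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.7,
+					},
+					Categories: []config.Category{strict, relaxed},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("strict")).To(Equal(float32(0.9)))
+				Expect(cfg.GetJailbreakThresholdForCategory("relaxed")).To(Equal(float32(0.5)))
+			})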
+
+			It("should allow lower threshold override", func() {
+				category := config.Category{
+					Name:               "test",
+					JailbreakThreshold: config.Float32Ptr(0.5),
+					ModelScores:        []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.7,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.5)))
+			})
+
+			It("should allow higher threshold override", func() {
+				category := config.Category{
+					Name:               "test",
+					JailbreakThreshold: config.Float32Ptr(0.95),
+					ModelScores:        []config.ModelScore{{Model: "test", Score: 1.0}},
+				}
+
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.7,
+					},
+					Categories: []config.Category{category},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.95)))
+			})
+		})
+
+		Context("when category does not exist", func() {
+			It("should fall back to global threshold", func() {
+				cfg := &config.RouterConfig{
+					PromptGuard: config.PromptGuardConfig{
+						Threshold: 0.8,
+					},
+					Categories: []config.Category{},
+				}
+
+				Expect(cfg.GetJailbreakThresholdForCategory("nonexistent")).To(Equal(float32(0.8)))
+			})
+		})
+	})
 })
diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go
index e90f4745..d2482f93 100644
--- a/src/semantic-router/pkg/extproc/request_handler.go
+++ b/src/semantic-router/pkg/extproc/request_handler.go
@@ -396,12 +396,7 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
 	// Get content from messages
 	userContent, nonUserMessages := extractUserAndNonUserContent(openAIRequest)
 
-	// Perform security checks
-	if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages); shouldReturn {
-		return response, nil
-	}
-
-	// Classify the request early to determine category for cache settings
+	// Classify the request early to determine category for security checks and cache settings
 	var categoryName string
 	if r.Config != nil && r.Config.IsAutoModelName(originalModel) && (len(nonUserMessages) > 0 || userContent != "") {
 		// Determine text to use for classification
@@ -417,6 +412,11 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
 		}
 	}
 
+	// Perform security checks with category-specific settings
+	if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages, categoryName); shouldReturn {
+		return response, nil
+	}
+
 	// Handle caching with category-specific settings
 	if response, shouldReturn := r.handleCaching(ctx, categoryName); shouldReturn {
 		return response, nil
@@ -426,19 +426,32 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
 	return r.handleModelRouting(openAIRequest, originalModel, userContent, nonUserMessages, ctx)
 }
 
-// performSecurityChecks performs PII and jailbreak detection
-func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) {
+// performSecurityChecks performs PII and jailbreak detection with category-specific settings
+func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string, categoryName string) (*ext_proc.ProcessingResponse, bool) {
 	// Perform PII classification on all message content
 	allContent := pii.ExtractAllContent(userContent, nonUserMessages)
 
+	// Check if jailbreak detection is enabled for this category
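+	// Note: the category-level setting can only tighten this check - if the
+	// classifier is globally disabled or not initialized, jailbreak_enabled: true
+	// on a category does not re-enable it here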
+	jailbreakEnabled := r.Classifier.IsJailbreakEnabled()
+	if categoryName != "" && r.Config != nil {
+		// Use category-specific setting if available
+		jailbreakEnabled = jailbreakEnabled && r.Config.IsJailbreakEnabledForCategory(categoryName)
+	}
+
+	// Get category-specific threshold, guarding against a nil config
+	var jailbreakThreshold float32
+	if r.Config != nil {
+		jailbreakThreshold = r.Config.PromptGuard.Threshold
+		if categoryName != "" {
+			jailbreakThreshold = r.Config.GetJailbreakThresholdForCategory(categoryName)
+		}
+	}
+
 	// Perform jailbreak detection on all message content
-	if r.Classifier.IsJailbreakEnabled() {
+	if jailbreakEnabled {
 		// Start jailbreak detection span
 		spanCtx, span := observability.StartSpan(ctx.TraceContext, observability.SpanJailbreakDetection)
 		defer span.End()
 
 		startTime := time.Now()
-		hasJailbreak, jailbreakDetections, err := r.Classifier.AnalyzeContentForJailbreak(allContent)
+		hasJailbreak, jailbreakDetections, err := r.Classifier.AnalyzeContentForJailbreakWithThreshold(allContent, jailbreakThreshold)
 		detectionTime := time.Since(startTime).Milliseconds()
 
 		observability.SetSpanAttributes(span,
diff --git a/src/semantic-router/pkg/utils/classification/classifier.go b/src/semantic-router/pkg/utils/classification/classifier.go
index ac5e5c0e..3dc820f1 100644
--- a/src/semantic-router/pkg/utils/classification/classifier.go
+++ b/src/semantic-router/pkg/utils/classification/classifier.go
@@ -425,6 +425,11 @@ func (c *Classifier) initializeJailbreakClassifier() error {
 
 // CheckForJailbreak analyzes the given text for jailbreak attempts
 func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, error) {
+	return c.CheckForJailbreakWithThreshold(text, c.Config.PromptGuard.Threshold)
+}
+
+// CheckForJailbreakWithThreshold analyzes the given text for jailbreak attempts with a custom threshold
+func (c *Classifier) CheckForJailbreakWithThreshold(text string, threshold float32) (bool, string, float32, error) {
 	if !c.IsJailbreakEnabled() {
 		return false, "", 0.0, fmt.Errorf("jailbreak detection is not enabled or properly configured")
 	}
@@ -453,14 +458,14 @@ func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, erro
 	}
 
 	// Check if confidence meets threshold and indicates jailbreak
-	isJailbreak := result.Confidence >= c.Config.PromptGuard.Threshold && jailbreakType == "jailbreak"
+	isJailbreak := result.Confidence >= threshold && jailbreakType == "jailbreak"
 
 	if isJailbreak {
 		observability.Warnf("JAILBREAK DETECTED: '%s' (confidence: %.3f, threshold: %.3f)",
-			jailbreakType, result.Confidence, c.Config.PromptGuard.Threshold)
+			jailbreakType, result.Confidence, threshold)
 	} else {
 		observability.Infof("BENIGN: '%s' (confidence: %.3f, threshold: %.3f)",
-			jailbreakType, result.Confidence, c.Config.PromptGuard.Threshold)
+			jailbreakType, result.Confidence, threshold)
 	}
 
 	return isJailbreak, jailbreakType, result.Confidence, nil
@@ -468,6 +473,11 @@ func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, erro
 
 // AnalyzeContentForJailbreak analyzes multiple content pieces for jailbreak attempts
 func (c *Classifier) AnalyzeContentForJailbreak(contentList []string) (bool, []JailbreakDetection, error) {
+	return c.AnalyzeContentForJailbreakWithThreshold(contentList, c.Config.PromptGuard.Threshold)
+}
+
+// AnalyzeContentForJailbreakWithThreshold analyzes multiple content pieces for jailbreak attempts with a custom threshold
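+// Items that fail classification are logged and skipped, so a single error does
+// not abort screening of the remaining content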
+func (c *Classifier) AnalyzeContentForJailbreakWithThreshold(contentList []string, threshold float32) (bool, []JailbreakDetection, error) {
 	if !c.IsJailbreakEnabled() {
 		return false, nil, fmt.Errorf("jailbreak detection is not enabled or properly configured")
 	}
@@ -480,7 +490,7 @@ func (c *Classifier) AnalyzeContentForJailbreak(contentList []string) (bool, []J
 			continue
 		}
 
-		isJailbreak, jailbreakType, confidence, err := c.CheckForJailbreak(content)
+		isJailbreak, jailbreakType, confidence, err := c.CheckForJailbreakWithThreshold(content, threshold)
 		if err != nil {
 			observability.Errorf("Error analyzing content %d: %v", i, err)
 			continue
diff --git a/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod b/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod
index 338e6383..d71496c7 100644
--- a/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod
+++ b/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod
@@ -4,4 +4,4 @@ go 1.24.1
 
 replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding
 
-require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
+require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
diff --git a/src/training/training_lora/pii_model_fine_tuning_lora/go.mod b/src/training/training_lora/pii_model_fine_tuning_lora/go.mod
index 99bdf152..c6f84bae 100644
--- a/src/training/training_lora/pii_model_fine_tuning_lora/go.mod
+++ b/src/training/training_lora/pii_model_fine_tuning_lora/go.mod
@@ -4,4 +4,4 @@ go 1.24.1
 
 replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding
 
-require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
+require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
diff --git a/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod b/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod
index 6195d9f1..869a3f41 100644
--- a/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod
+++ b/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod
@@ -4,4 +4,4 @@ go 1.24.1
 
 replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding
 
-require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
+require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
diff --git a/website/docs/installation/configuration.md b/website/docs/installation/configuration.md
index 37d0f8e0..18742ee3 100644
--- a/website/docs/installation/configuration.md
+++ b/website/docs/installation/configuration.md
@@ -38,7 +38,7 @@ tools:
 
 # Jailbreak protection
 prompt_guard:
-  enabled: false
+  enabled: false # Global default - can be overridden per category
   use_modernbert: true
   model_id: "models/jailbreak_classifier_modernbert-base_model"
   threshold: 0.7
@@ -84,6 +84,8 @@ categories:
     # Optional: Category-level cache settings
     # semantic_cache_enabled: true
    # semantic_cache_similarity_threshold: 0.9 # Higher threshold for math
+    # Optional: Category-level jailbreak settings
+    # jailbreak_enabled: true # Override global jailbreak detection
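+    # jailbreak_threshold: 0.9 # Override global jailbreak detection threshold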
   - name: computer science
     model_scores:
       - model: your-model
diff --git a/website/docs/overview/categories/configuration.md b/website/docs/overview/categories/configuration.md
index 7bc776d0..9a274ec9 100644
--- a/website/docs/overview/categories/configuration.md
+++ b/website/docs/overview/categories/configuration.md
@@ -83,6 +83,70 @@ curl -X PUT http://localhost:8080/config/system-prompts \
 
 ### Reasoning Configuration
 
+#### `jailbreak_enabled` (Optional)
+
+- **Type**: Boolean
+- **Description**: Whether to enable jailbreak detection for this category
+- **Default**: Inherits from global `prompt_guard.enabled` setting
+- **Impact**: Enables or disables jailbreak protection for this specific category
+
+```yaml
+categories:
+  - name: customer_support
+    jailbreak_enabled: true # Explicitly enable for public-facing
+    model_scores:
+      - model: qwen3
+        score: 0.8
+
+  - name: code_generation
+    jailbreak_enabled: false # Disable for internal tools
+    model_scores:
+      - model: qwen3
+        score: 0.9
+
+  - name: general
+    # No jailbreak_enabled - inherits from global prompt_guard.enabled
+    model_scores:
+      - model: qwen3
+        score: 0.5
+```
+
+#### `jailbreak_threshold` (Optional)
+
+- **Type**: Float (0.0-1.0)
+- **Description**: Confidence threshold for jailbreak detection
+- **Default**: Inherits from global `prompt_guard.threshold` setting
+- **Impact**: Controls sensitivity of jailbreak detection for this category
+- **Tuning**: Higher values require more confidence before blocking (fewer false positives); lower values are more sensitive (catch more attacks, more false positives)
+
+```yaml
+categories:
+  - name: customer_support
+    jailbreak_enabled: true
+    jailbreak_threshold: 0.9 # High bar for public-facing - blocks only clear jailbreaks
+    model_scores:
+      - model: qwen3
+        score: 0.8
+
+  - name: code_generation
+    jailbreak_enabled: true
+    jailbreak_threshold: 0.5 # More sensitive - acceptable for internal tooling
+    model_scores:
+      - model: qwen3
+        score: 0.9
+
+  - name: general
+    # No jailbreak_threshold - inherits from global prompt_guard.threshold
+    model_scores:
+      - model: qwen3
+        score: 0.5
+```
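+
+**Example**: With the settings above, a prompt the classifier labels "jailbreak" with confidence 0.85 is blocked for `code_generation` (0.85 >= 0.5) but passes for `customer_support` (0.85 < 0.9).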
+
+**Threshold Guidelines**:
+
+- **0.8-0.95**: Public-facing categories where falsely blocking legitimate users is costly (customer support, business)
+- **0.6-0.8**: Standard categories (general queries)
+- **0.4-0.6**: Internal/technical categories where false positives are cheap (code generation, development tools)
+
 #### `use_reasoning` (Required)
 
 - **Type**: Boolean
@@ -196,7 +260,48 @@ categories:
     score: 0.2
 ```
 
-### Example 3: Multi-Category Configuration
+### Example 3: Security-Focused Configuration (Jailbreak Protection)
+
+```yaml
+categories:
+  # Public-facing category - high threshold avoids falsely blocking customers
+  - name: "customer_support"
+    description: "Customer support and general inquiries"
+    jailbreak_enabled: true # Explicit jailbreak protection
+    jailbreak_threshold: 0.9 # Blocks only high-confidence jailbreaks
+    use_reasoning: false
+    model_scores:
+      - model: "phi4"
+        score: 0.9
+      - model: "mistral-small3.1"
+        score: 0.7
+
+  # Technical category - lower threshold screens aggressively
+  - name: "code_generation"
+    description: "Code generation for developers"
+    jailbreak_enabled: true # Keep enabled
+    jailbreak_threshold: 0.5 # More sensitive - internal users tolerate false flags
+    use_reasoning: true
+    reasoning_effort: "medium"
+    model_scores:
+      - model: "gemma3:27b"
+        score: 0.9
+      - model: "phi4"
+        score: 0.7
+
+  # General category using global default
+  - name: "general"
+    description: "General queries"
+    # jailbreak_enabled not specified - inherits from global prompt_guard.enabled
+    use_reasoning: false
+    model_scores:
+      - model: "phi4"
+        score: 0.6
+      - model: "mistral-small3.1"
+        score: 0.6
+```
+
+### Example 4: Multi-Category Configuration
 
 ```yaml
 categories:
diff --git a/website/docs/tutorials/content-safety/jailbreak-protection.md b/website/docs/tutorials/content-safety/jailbreak-protection.md
index 6f3ac801..60774e60 100644
--- a/website/docs/tutorials/content-safety/jailbreak-protection.md
+++ b/website/docs/tutorials/content-safety/jailbreak-protection.md
@@ -43,7 +43,7 @@
 Enable jailbreak detection in your configuration:
 
 ```yaml
 # config/config.yaml
 prompt_guard:
-  enabled: true
+  enabled: true # Global default - can be overridden per category
   model_id: "models/jailbreak_classifier_modernbert-base_model"
   threshold: 0.7 # Detection sensitivity (0.0-1.0)
   use_cpu: true # Run on CPU
@@ -51,6 +51,65 @@ prompt_guard:
   jailbreak_mapping_path: "config/jailbreak_type_mapping.json" # Path to jailbreak type mapping
 ```
 
+### Category-Level Jailbreak Protection
+
+You can configure jailbreak detection at the category level for fine-grained security control, covering both enabling/disabling and threshold customization:
+
+```yaml
+# Global default settings
+prompt_guard:
+  enabled: true # Default for all categories
+  threshold: 0.7 # Default threshold for all categories
+
+categories:
+  # Public-facing category - block only high-confidence jailbreaks
+  - name: customer_support
+    jailbreak_enabled: true # Explicit protection for public-facing traffic
+    jailbreak_threshold: 0.9 # Higher threshold - fewer false blocks of real customers
+    model_scores:
+      - model: qwen3
+        score: 0.8
+
+  # Internal tool - more sensitive screening for code/technical content
+  - name: code_generation
+    jailbreak_enabled: true # Keep enabled
+    jailbreak_threshold: 0.5 # Lower threshold - flags more borderline prompts
+    model_scores:
+      - model: qwen3
+        score: 0.9
+
+  # General category - inherits global settings
+  - name: general
+    # No jailbreak_enabled or jailbreak_threshold specified
+    # Uses global prompt_guard.enabled (true) and threshold (0.7)
+    model_scores:
+      - model: qwen3
+        score: 0.5
+```
+
+**Category-Level Behavior**:
+
+- **When `jailbreak_enabled` is not specified**: Category inherits from global `prompt_guard.enabled`
+- **When `jailbreak_enabled: true`**: Jailbreak detection is explicitly enabled for this category
+- **When `jailbreak_enabled: false`**: Jailbreak detection is explicitly disabled for this category
+- **When `jailbreak_threshold` is not specified**: Category inherits from global `prompt_guard.threshold`
+- **When `jailbreak_threshold: 0.X`**: Uses category-specific threshold (0.0-1.0)
+- **Category-specific settings always override global settings** when explicitly configured
+
+**Threshold Tuning Guide**:
+
+- **High threshold (0.8-0.95)**: Requires high confidence before blocking - fewer false positives, may miss subtle attacks
+- **Medium threshold (0.6-0.8)**: Balanced detection, good for most use cases
+- **Low threshold (0.4-0.6)**: More sensitive - catches more attacks, higher false positive rate
+- **Recommended**: Start with 0.7 globally, adjust per category based on risk profile and false positive tolerance
+
+**Use Cases**:
+
+- **Public-facing categories (0.8-0.9 threshold)**: Customer support, business advice, public APIs where falsely blocking legitimate users is costly
+- **Technical categories (0.5-0.6 threshold)**: Code generation, developer tools - aggressive screening where occasional false flags are cheap
+- **Internal tools (low threshold or disabled)**: Testing environments, trusted internal applications
+- **General categories (inherit global)**: Use global default for most categories
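+
+**Worked example**: With the configuration above, a prompt that the classifier labels "jailbreak" with confidence 0.75 is blocked for `general` (0.75 >= 0.7) and for `code_generation` (0.75 >= 0.5), but passes the guard for `customer_support` (0.75 < 0.9).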
+
 ## How Jailbreak Protection Works
 
 The jailbreak protection system works as follows:
@@ -134,9 +193,38 @@
 security_policy_violations_total 45
 
 ### 4. Integration with Routing
 
 - Apply stricter protection to sensitive models
-- Use different thresholds for different categories
+- Use category-level jailbreak settings for different domains
 - Combine with PII detection for comprehensive security
 
+**Example**: Configure different jailbreak policies per category:
+
+```yaml
+prompt_guard:
+  enabled: true # Global default
+
+categories:
+  # Strict protection for customer-facing categories
+  - name: customer_support
+    jailbreak_enabled: true
+    model_scores:
+      - model: safe-model
+        score: 0.9
+
+  # Relaxed protection for internal development
+  - name: code_generation
+    jailbreak_enabled: false # Allow broader input
+    model_scores:
+      - model: code-model
+        score: 0.9
+
+  # Use global default for general queries
+  - name: general
+    # Inherits from prompt_guard.enabled
+    model_scores:
+      - model: general-model
+        score: 0.7
+```
+
 ## Troubleshooting
 
 ### High False Positives
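+
+If false positives cluster in a single category (code-heavy and highly technical
+categories are common sources), raise that category's `jailbreak_threshold`
+instead of loosening the global setting, for example:
+
+```yaml
+categories:
+  - name: code_generation
+    jailbreak_threshold: 0.85 # Raise only where benign prompts are being flagged
+```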