|
| 1 | +# Category-Level Jailbreak Detection Example |
| 2 | +# This example demonstrates how to configure jailbreak detection at the category level |
| 3 | +# Different categories can have different jailbreak detection settings and thresholds based on their risk profiles |
| 4 | + |
| 5 | +# Global jailbreak detection configuration (can be overridden per category) |
| 6 | +prompt_guard: |
| 7 | + enabled: true # Global default - can be overridden per category |
| 8 | + use_modernbert: true |
| 9 | + model_id: "models/jailbreak_classifier_modernbert-base_model" |
| 10 | + threshold: 0.7 # Global default threshold - can be overridden per category |
| 11 | + use_cpu: true |
| 12 | + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" |
| 13 | + |
| 14 | +# Classifier configuration |
| 15 | +classifier: |
| 16 | + category_model: |
| 17 | + model_id: "models/category_classifier_modernbert-base_model" |
| 18 | + use_modernbert: true |
| 19 | + threshold: 0.6 |
| 20 | + use_cpu: true |
| 21 | + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" |
| 22 | + |
| 23 | +# Categories with different jailbreak detection settings |
| 24 | +categories: |
| 25 | + # High-security category: Strict jailbreak detection with high threshold |
| 26 | + - name: business |
| 27 | + description: "Business queries, strategy, and professional advice" |
| 28 | + jailbreak_enabled: true # Explicitly enable (inherits from global by default) |
| 29 | + jailbreak_threshold: 0.9 # Higher threshold: flag only high-confidence detections (fewer false positives) |
| 30 | + system_prompt: "You are a professional business consultant. Provide practical, actionable business advice." |
| 31 | + model_scores: |
| 32 | + - model: qwen3 |
| 33 | + score: 0.7 |
| 34 | + use_reasoning: false |
| 35 | + |
| 36 | + # Public-facing category: Enable with standard threshold |
| 37 | + - name: customer_support |
| 38 | + description: "Customer support and general inquiries" |
| 39 | + jailbreak_enabled: true # Explicitly enable for customer-facing content |
| 40 | + jailbreak_threshold: 0.8 # Slightly higher than global for public-facing |
| 41 | + system_prompt: "You are a friendly customer support agent. Help users with their questions." |
| 42 | + model_scores: |
| 43 | + - model: qwen3 |
| 44 | + score: 0.8 |
| 45 | + use_reasoning: false |
| 46 | + |
| 47 | + # Internal tool category: Threshold lower than the global default (trusted environment) |
| 48 | + - name: code_generation |
| 49 | + description: "Internal code generation and development tools" |
| 50 | + jailbreak_enabled: true # Keep enabled but with relaxed threshold |
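| 51 | + jailbreak_threshold: 0.5 # NOTE(review): below the global 0.7, so detection is MORE sensitive (more false positives); use a value above 0.7 if the goal is fewer false positives for code |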
| 52 | + system_prompt: "You are a code generation assistant for internal developers." |
| 53 | + model_scores: |
| 54 | + - model: qwen3 |
| 55 | + score: 0.9 |
| 56 | + use_reasoning: true |
| 57 | + |
| 58 | + # Testing category: Disable jailbreak detection |
| 59 | + - name: testing |
| 60 | + description: "Testing and quality assurance queries" |
| 61 | + jailbreak_enabled: false # Disable for testing purposes |
| 62 | + system_prompt: "You are a QA assistant helping with test scenarios." |
| 63 | + model_scores: |
| 64 | + - model: qwen3 |
| 65 | + score: 0.6 |
| 66 | + use_reasoning: false |
| 67 | + |
| 68 | + # Default category: Uses global settings (inherits prompt_guard.enabled and prompt_guard.threshold) |
| 69 | + - name: general |
| 70 | + description: "General queries that don't fit into specific categories" |
| 71 | + # jailbreak_enabled not specified - will inherit from global prompt_guard.enabled |
| 72 | + # jailbreak_threshold not specified - will inherit from global prompt_guard.threshold (0.7) |
| 73 | + system_prompt: "You are a helpful assistant." |
| 74 | + model_scores: |
| 75 | + - model: qwen3 |
| 76 | + score: 0.5 |
| 77 | + use_reasoning: false |
| 78 | + |
| 79 | +# Model configuration |
| 80 | +model_config: |
| 81 | + "qwen3": |
| 82 | + reasoning_family: "qwen3" |
| 83 | + preferred_endpoints: ["endpoint1"] |
| 84 | + pii_policy: |
| 85 | + allow_by_default: true |
| 86 | + |
| 87 | +# Reasoning family configurations |
| 88 | +reasoning_families: |
| 89 | + qwen3: |
| 90 | + type: "chat_template_kwargs" |
| 91 | + parameter: "thinking" |
| 92 | + |
| 93 | +# Default model for fallback |
| 94 | +default_model: qwen3 |
| 95 | + |
| 96 | +# vLLM endpoints configuration |
| 97 | +vllm_endpoints: |
| 98 | + - name: "endpoint1" |
| 99 | + address: "127.0.0.1" |
| 100 | + port: 8000 |
| 101 | + weight: 1 |
| 102 | + |
| 103 | +# Usage Notes: |
| 104 | +# ============= |
| 105 | +# 1. Global Settings: |
| 106 | +# - prompt_guard.enabled: Sets the default enabled/disabled state for all categories |
| 107 | +# - prompt_guard.threshold: Sets the default detection threshold (0.0-1.0) for all categories |
| 108 | +# 2. Category Overrides: |
| 109 | +# - jailbreak_enabled: Override global enabled/disabled setting per category |
| 110 | +# - jailbreak_threshold: Override global threshold per category |
| 111 | +# 3. Inheritance: |
| 112 | +# - If jailbreak_enabled is not specified, inherits from prompt_guard.enabled |
| 113 | +# - If jailbreak_threshold is not specified, inherits from prompt_guard.threshold |
| 114 | +# 4. Threshold Tuning: |
| 115 | +# - Higher threshold (0.8-0.95): Flags only high-confidence matches; fewer false positives, but may miss subtle attacks |
| 116 | +# - Lower threshold (0.5-0.7): More sensitive detection, catches more attacks, higher false positive rate |
| 117 | +# - Recommended: Start with 0.7 globally, adjust per category based on risk profile |
| 118 | +# 5. Use Cases: |
| 119 | +# - High-security categories (business, customer_support): Use higher thresholds (0.8-0.9) |
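| 120 | +# - Internal tools with code/technical content: Lower thresholds (0.5-0.6) increase sensitivity and false positives (see Threshold Tuning); raise the threshold instead if false positives on code are the concern |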
| 121 | +# - General categories: Use global default threshold |
| 122 | +# 6. Security Best Practices: |
| 123 | +# - Enable jailbreak detection by default (prompt_guard.enabled: true) |
| 124 | +# - Only disable or use very low thresholds for specific categories where the risk is managed differently |
| 125 | +# - Consider the consequences of threshold settings on a per-category basis |
| 126 | +# - Monitor false positive and false negative rates to tune thresholds appropriately |
0 commit comments