
Commit b363ae1

🛡️ add support for model specific metrics & add LLAMA_GUARD_3_SAFETY metric
1 parent aa6481d commit b363ae1

8 files changed: +106 −18 lines changed

README.md

Lines changed: 12 additions & 2 deletions

@@ -5,16 +5,17 @@
 
 # vLLM Judge
 
-A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
+A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Evaluate LLM inputs & outputs at scale with just a few lines of code. From simple scoring to complex safety checks, vLLM Judge adapts to your needs. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
 
 ## Features
 
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
+- 🔄 **Async Native**: Built for high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
 - **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
-- 🔄 **Async Native**: Built for high-throughput evaluations
 
 ## Installation
 
@@ -65,6 +66,15 @@ result = await judge.evaluate(
         "audience": "high school students"
     }
 )
+
+# Works with specialized safety models out-of-the-box
+from vllm_judge import LLAMA_GUARD_3_SAFETY
+
+result = await judge.evaluate(
+    response="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY  # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 ## API Server

docs/guide/metrics.md

Lines changed: 8 additions & 2 deletions

@@ -121,12 +121,18 @@ result = await judge.evaluate(
 Classifies content safety level.
 
 ```python
+# Works with specialized safety models out-of-the-box
+result = await judge.evaluate(
+    response="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY  # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
+
+# If not using specialized models, use LLM of your choice with generic metric
 result = await judge.evaluate(
     response="This tutorial shows how to build a web scraper.",
     metric=SAFETY
 )
-# Returns: 'safe', 'potentially_harmful', or 'harmful'
-# No numeric score
 ```
 
 #### TOXICITY

docs/index.md

Lines changed: 12 additions & 2 deletions

@@ -1,15 +1,16 @@
 # vLLM Judge
 
-A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models.
+A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Evaluate LLM inputs & outputs at scale with just a few lines of code. From simple scoring to complex safety checks, vLLM Judge adapts to your needs.
 
 ## Features
 
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
+- 🔄 **Async Native**: Built for high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
 - **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
-- 🔄 **Async Native**: Built for high-throughput evaluations
 
 ## Installation
 
@@ -60,6 +61,15 @@ result = await judge.evaluate(
         "audience": "high school students"
     }
 )
+
+# Works with specialized safety models out-of-the-box
+from vllm_judge import LLAMA_GUARD_3_SAFETY
+
+result = await judge.evaluate(
+    response="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY  # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 ## API Server

src/vllm_judge/__init__.py

Lines changed: 5 additions & 1 deletion

@@ -13,7 +13,8 @@
     EvaluationResult,
     Metric,
     BatchResult,
-    TemplateEngine
+    TemplateEngine,
+    ModelSpecificMetric
 )
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.metrics import (
@@ -27,6 +28,7 @@
     # Safety metrics
     SAFETY,
     TOXICITY,
+    LLAMA_GUARD_3_SAFETY,
 
     # Code metrics
     CODE_QUALITY,
@@ -81,6 +83,7 @@
     "BatchResult",
     "TemplateEngine",
     "TemplateProcessor",
+    "ModelSpecificMetric",
 
     # Metrics
     "HELPFULNESS",
@@ -90,6 +93,7 @@
     "RELEVANCE",
     "SAFETY",
     "TOXICITY",
+    "LLAMA_GUARD_3_SAFETY",
    "CODE_QUALITY",
    "CODE_SECURITY",
    "CREATIVITY",

src/vllm_judge/judge.py

Lines changed: 36 additions & 9 deletions

@@ -2,7 +2,7 @@
 import re
 from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
-from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine
+from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
 from vllm_judge.prompts import PromptBuilder
 from vllm_judge.batch import BatchProcessor
@@ -14,6 +14,9 @@
     MetricNotFoundError,
     VLLMJudgeError
 )
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class Judge:
@@ -96,6 +99,22 @@ async def evaluate(
             MetricNotFoundError: If metric name not found
             ParseError: If unable to parse model response
         """
+        # Handle model-specific metrics
+        if isinstance(metric, ModelSpecificMetric):
+            assert isinstance(response, str), "Model-specific metrics only support string content for now"
+
+            # logger.info(f"Evaluating model-specific metric {metric.name}.")
+            logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
+            # Skip ALL our formatting
+            messages = [{"role": "user", "content": response}]
+
+            # vLLM applies model's chat template automatically
+            llm_response = await self._call_model(messages)
+
+            # Use metric's parser
+            return metric.parser_func(llm_response)
+
+        # Handle normal metrics
         # Handle metric parameter
         metric_template_vars = {}
 
@@ -149,14 +168,7 @@ async def evaluate(
         )
 
         # Get LLM response
-        try:
-            if self.config.use_chat_api:
-                llm_response = await self.client.chat_completion(messages)
-            else:
-                prompt = PromptBuilder.format_messages_as_text(messages)
-                llm_response = await self.client.completion(prompt)
-        except Exception as e:
-            raise VLLMJudgeError(f"Failed to get model response: {e}")
+        llm_response = await self._call_model(messages)
 
         # Parse response
         result = self._parse_response(llm_response)
@@ -168,6 +180,21 @@ async def evaluate(
 
         return result
 
+    async def _call_model(self, messages: List[Dict[str, str]]) -> str:
+        """
+        Call the model with the given messages.
+        """
+        try:
+            if self.config.use_chat_api:
+                llm_response = await self.client.chat_completion(messages)
+            else:
+                prompt = PromptBuilder.format_messages_as_text(messages)
+                llm_response = await self.client.completion(prompt)
+            return llm_response
+        except Exception as e:
+            raise VLLMJudgeError(f"Failed to get model response: {e}")
+
+
     def _parse_response(self, response: str) -> EvaluationResult:
         """
         Parse LLM response into EvaluationResult.
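
In short, `evaluate()` now dispatches on the metric type: a `ModelSpecificMetric` sends the response as a single raw user message (so vLLM can apply the model's own chat template) and hands the reply to the metric's `parser_func`, while ordinary metrics keep the existing prompt-building and parsing path through the new `_call_model` helper. A hedged usage sketch; it assumes `judge` is an already-configured `Judge` instance, which is not shown in this diff:

```python
from vllm_judge import LLAMA_GUARD_3_SAFETY, SAFETY

async def check(judge, text: str):
    # Model-specific path: `text` goes to the model as-is and
    # LLAMA_GUARD_3_SAFETY's parser interprets the "safe"/"unsafe" reply.
    guard_result = await judge.evaluate(response=text, metric=LLAMA_GUARD_3_SAFETY)

    # Normal path: the library builds its usual judge prompt around `text`.
    generic_result = await judge.evaluate(response=text, metric=SAFETY)
    return guard_result, generic_result
```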

src/vllm_judge/metrics.py

Lines changed: 9 additions & 1 deletion

@@ -1,5 +1,6 @@
 from typing import Dict
-from vllm_judge.models import Metric,TemplateEngine
+from vllm_judge.models import Metric, TemplateEngine, ModelSpecificMetric
+from vllm_judge.utils import parse_llama_guard_3
 
 # Registry for built-in metrics
 BUILTIN_METRICS: Dict[str, Metric] = {}
@@ -11,6 +12,13 @@ def create_builtin_metric(metric: Metric) -> Metric:
     return metric
 
 
+# Llama Guard 3 safety metric
+LLAMA_GUARD_3_SAFETY = create_builtin_metric(ModelSpecificMetric(
+    name="llama_guard_3_safety",
+    model_pattern="llama_guard_3",
+    parser_func=parse_llama_guard_3
+))
+
 # General purpose metrics
 HELPFULNESS = create_builtin_metric(Metric(
     name="helpfulness",

src/vllm_judge/models.py

Lines changed: 10 additions & 1 deletion

@@ -1,4 +1,4 @@
-from typing import Optional, Any, Dict, Union, List, Tuple
+from typing import Optional, Any, Dict, Union, List, Tuple, Callable
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
 
@@ -159,6 +159,15 @@ def _auto_detect_required_vars(self):
     def __repr__(self):
         return f"Metric(name='{self.name}', criteria='{self.criteria}', template_engine='{self.template_engine}')"
 
+# Base class for model-specific metrics
+class ModelSpecificMetric(Metric):
+    """Metric that bypasses our prompt formatting."""
+
+    def __init__(self, name: str, model_pattern: str, parser_func: Callable[[str], EvaluationResult]):
+        super().__init__(name=name, criteria="model-specific evaluation")
+        self.model_pattern = model_pattern
+        self.parser_func = parser_func
+        # self.is_model_specific = True  # Flag for special handling
 
 class BatchResult(BaseModel):
     """Result of batch evaluation."""

src/vllm_judge/utils.py

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+from vllm_judge.models import EvaluationResult
+
+# Llama Guard 3 parser
+def parse_llama_guard_3(response: str) -> EvaluationResult:
+    """Parse Llama Guard 3's 'safe/unsafe' format."""
+    lines = response.strip().split('\n')
+    is_safe = lines[0].lower().strip() == 'safe'
+
+    return EvaluationResult(
+        decision="safe" if is_safe else "unsafe",
+        reasoning=lines[1] if len(lines) > 1 else "No violations detected",
+        score=None,
+        metadata={"model_type": "llama_guard_3"}
+    )
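
For reference, a quick sketch of how this parser behaves on the two output shapes it expects from Llama Guard 3: a bare "safe", or "unsafe" followed by a violation category code on the next line:

```python
from vllm_judge.utils import parse_llama_guard_3

safe = parse_llama_guard_3("safe")
print(safe.decision, safe.reasoning)      # safe No violations detected

unsafe = parse_llama_guard_3("unsafe\nS9")
print(unsafe.decision, unsafe.reasoning)  # unsafe S9
```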
