feat(RHOAIENG-28840): Support '/api/v1/text/generation' detections #23

Open · wants to merge 6 commits into main

1 change: 0 additions & 1 deletion detectors/Dockerfile.judge
@@ -21,7 +21,6 @@ RUN echo "$CACHEBUST"
COPY ./common /app/detectors/common
COPY ./llm_judge/app.py /app/detectors/llm_judge/app.py
COPY ./llm_judge/detector.py /app/detectors/llm_judge/detector.py
COPY ./llm_judge/scheme.py /app/detectors/llm_judge/scheme.py
RUN touch /app/detectors/llm_judge/__init__.py

EXPOSE 8000
27 changes: 26 additions & 1 deletion detectors/common/scheme.py
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field, RootModel

@@ -134,6 +134,7 @@ class ContentAnalysisResponse(BaseModel):
description="Optional field providing evidences for the provided detection",
default=None,
)
metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Additional metadata from evaluation")


class ContentsAnalysisResponse(RootModel):
@@ -145,3 +146,27 @@ class ContentsAnalysisResponse(RootModel):
class Error(BaseModel):
code: int
message: str

class MetricsListResponse(BaseModel):
"""Response for listing available metrics."""
metrics: List[str] = Field(description="List of available metric names")
total: int = Field(description="Total number of available metrics")

class GenerationAnalysisHttpRequest(BaseModel):
prompt: str = Field(description="Prompt is the user input to the LLM", example="What do you think about the future of AI?")
generated_text: str = Field(description="Generated response from the LLM", example="The future of AI is bright but we need to be careful about the risks.")
detector_params: Optional[Dict[str, Any]] = Field(
default_factory=dict,
description="Detector parameters for evaluation (e.g., metric, criteria, etc.)",
example={"metric": "safety"}
)

class GenerationAnalysisResponse(BaseModel):
detection: str = Field(example="safe")
detection_type: str = Field(example="llm_judge")
score: float = Field(example=0.8)
evidences: Optional[List[EvidenceObj]] = Field(
description="Optional field providing evidences for the provided detection",
default=[],
)
metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Additional metadata from evaluation")
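
For orientation, a minimal sketch of how the new generation models defined above could be exercised. It assumes pydantic v2 (for model_dump_json) and that detectors.common.scheme is importable from the repository root; all field values are illustrative.

from detectors.common.scheme import (
    GenerationAnalysisHttpRequest,
    GenerationAnalysisResponse,
)

# Request body as a client would send it to the new generation endpoint.
request = GenerationAnalysisHttpRequest(
    prompt="What do you think about the future of AI?",
    generated_text="The future of AI is bright but we need to be careful about the risks.",
    detector_params={"metric": "safety"},
)

# Response shape the detector would return (values shown are made up).
response = GenerationAnalysisResponse(
    detection="safe",
    detection_type="llm_judge",
    score=0.8,
    evidences=[],
    metadata={"reasoning": "No harmful content detected."},
)

print(request.model_dump_json(indent=2))
print(response.model_dump_json(indent=2))
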
30 changes: 26 additions & 4 deletions detectors/llm_judge/app.py
@@ -6,11 +6,13 @@

from detectors.common.app import DetectorBaseAPI as FastAPI
from detectors.llm_judge.detector import LLMJudgeDetector
from detectors.llm_judge.scheme import (
from detectors.common.scheme import (
ContentAnalysisHttpRequest,
ContentsAnalysisResponse,
MetricsListResponse,
Error,
GenerationAnalysisHttpRequest,
GenerationAnalysisResponse,
)


@@ -35,23 +37,43 @@ async def lifespan(app: FastAPI):
"/api/v1/text/contents",
response_model=ContentsAnalysisResponse,
description="""LLM-as-Judge detector that evaluates content using various metrics like safety, toxicity, accuracy, helpfulness, etc. \
The metric parameter allows you to specify which evaluation criteria to use. \
The metric key in the detector_params parameter allows you to specify which evaluation criteria to use. \
Supports all built-in vllm_judge metrics including safety, accuracy, helpfulness, clarity, and many more.""",
responses={
404: {"model": Error, "description": "Resource Not Found"},
422: {"model": Error, "description": "Validation Error"},
},
)
async def detector_unary_handler(
async def detector_content_analysis_handler(
request: ContentAnalysisHttpRequest,
detector_id: Annotated[str, Header(example="llm_judge_safety")],
):
"""Analyze content using LLM-as-Judge evaluation."""
detector: LLMJudgeDetector = app.get_detector()
if not detector:
raise HTTPException(status_code=503, detail="Detector not found")
return ContentsAnalysisResponse(root=await detector.run(request))
return ContentsAnalysisResponse(root=await detector.analyze_content(request))

@app.post(
"/api/v1/text/generation",
response_model=GenerationAnalysisResponse,
description="""Analyze a single generation using the specified metric. \
The metric key in the detector_params parameter allows you to specify which evaluation criteria to use. \
Supports all built-in vllm_judge metrics including safety, accuracy, helpfulness, clarity, and many more.""",
responses={
404: {"model": Error, "description": "Resource Not Found"},
422: {"model": Error, "description": "Validation Error"},
},
)
async def detector_generation_analysis_handler(
request: GenerationAnalysisHttpRequest,
detector_id: Annotated[str, Header(example="llm_judge_safety")],
):
"""Analyze a single generation using LLM-as-Judge evaluation."""
detector: LLMJudgeDetector = app.get_detector()
if not detector:
raise HTTPException(status_code=503, detail="Detector not found")
return await detector.analyze_generation(request)

@app.get(
"/api/v1/metrics",
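
For a quick smoke test of the new route, a request along these lines should work once the service is up. The host, port, and detector-id value are illustrative, and the requests dependency is assumed to be available; FastAPI exposes the detector_id header parameter as the "detector-id" HTTP header.

import requests

# Call the new generation-analysis route on an assumed local deployment.
resp = requests.post(
    "http://localhost:8000/api/v1/text/generation",
    headers={"detector-id": "llm_judge_safety"},
    json={
        "prompt": "What do you think about the future of AI?",
        "generated_text": "The future of AI is bright but we need to be careful about the risks.",
        "detector_params": {"metric": "safety"},
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # e.g. {"detection": "safe", "detection_type": "llm_judge", "score": 0.8, ...}
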
101 changes: 81 additions & 20 deletions detectors/llm_judge/detector.py
@@ -4,10 +4,12 @@
from vllm_judge import Judge, EvaluationResult, BUILTIN_METRICS
from vllm_judge.exceptions import MetricNotFoundError
from detectors.common.app import logger
from detectors.llm_judge.scheme import (
from detectors.common.scheme import (
ContentAnalysisHttpRequest,
ContentAnalysisResponse,
ContentsAnalysisResponse,
GenerationAnalysisHttpRequest,
GenerationAnalysisResponse,
)


@@ -40,33 +42,49 @@ def _initialize_judge(self) -> None:
logger.error(f"Failed to detect model: {e}")
raise

async def evaluate_single_content(self, content: str, params: Dict[str, Any]) -> ContentAnalysisResponse:
def _validate_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
"""
Evaluate a single piece of content using the specified metric.

Args:
content: Text content to evaluate
params: vLLM Judge parameters for the evaluation

Returns:
ContentAnalysisResponse with evaluation results
Make sure the params have valid metric/criteria and scale.
"""
if "metric" not in params:
if "criteria" not in params:
params["metric"] = "safety" # Default to safety
elif "scale" not in params:
params["scale"] = (0, 1) # Default to 0-1 scale

if "metric" in params:
else:
if params["metric"] not in self.available_metrics:
raise MetricNotFoundError(
f"Metric '{params['metric']}' not found. Available metrics: {', '.join(sorted(self.available_metrics))}"
)
judge_metric = BUILTIN_METRICS[params["metric"]]
if judge_metric.scale is None:
params["scale"] = (0, 1) # Default to 0-1 scale

return params

def _get_score(self, result: EvaluationResult) -> float:
"""
Get the score from the evaluation result.
"""
if isinstance(result.decision, (int, float)) or result.score is not None:
return float(result.score if result.score is not None else result.decision)
logger.warning(f"Score is not a number: '{result.score}'. Defaulting to 0.0")
return 0.0 # FIXME: default to 0 because of non-optional field in schema

async def evaluate_single_content(self, content: str, params: Dict[str, Any]) -> ContentAnalysisResponse:
"""
Evaluate a single piece of content using the specified metric.

Args:
content: Text content to evaluate
params: vLLM Judge parameters for the evaluation

Returns:
ContentAnalysisResponse with evaluation results
"""
params: Dict[str, Any] = self._validate_params(params)

evaluation_params = {
evaluation_params: Dict[str, Any] = {
"content": content,
**params
}
@@ -76,11 +94,8 @@ async def evaluate_single_content(self, content: str, params: Dict[str, Any]) ->
**evaluation_params
)

# Convert to response format
score = None
if isinstance(result.decision, (int, float)) or result.score is not None:
# Numeric result
score = float(result.score if result.score is not None else result.decision)
# Convert to response format.
score: float = self._get_score(result)

return ContentAnalysisResponse(
start=0,
@@ -93,12 +108,12 @@ async def evaluate_single_content(self, content: str, params: Dict[str, Any]) ->
metadata={"reasoning": result.reasoning}
)

async def run(self, request: ContentAnalysisHttpRequest) -> ContentsAnalysisResponse:
async def analyze_content(self, request: ContentAnalysisHttpRequest) -> ContentsAnalysisResponse:
"""
Run content analysis for each input text.

Args:
request: Input request containing texts and metric to analyze
request: Input request containing texts and optional metric to analyze

Returns:
ContentsAnalysisResponse: The aggregated response for all input texts
@@ -111,7 +126,53 @@ async def run(self, request: ContentAnalysisHttpRequest) -> ContentsAnalysisResp
contents_analyses.append([analysis]) # Wrap in list to match schema

return contents_analyses

async def evaluate_single_generation(self, prompt: str, generated_text: str, params: Dict[str, Any]) -> GenerationAnalysisResponse:
"""
Evaluate a single generation based on the prompt and generated text.

Args:
prompt: Prompt to the LLM
generated_text: Generated text from the LLM
params: vLLM Judge parameters for the evaluation

Returns:
GenerationAnalysisResponse: The response for the generation analysis
"""
params: Dict[str, Any] = self._validate_params(params)
evaluation_params: Dict[str, Any] = {
"input": prompt,
"content": generated_text,
**params
}

result: EvaluationResult = await self.judge.evaluate(
**evaluation_params
)

score: float = self._get_score(result)

return GenerationAnalysisResponse(
detection=str(result.decision),
detection_type="llm_judge",
score=score,
evidences=[],
metadata={"reasoning": result.reasoning}
)

async def analyze_generation(self, request: GenerationAnalysisHttpRequest) -> GenerationAnalysisResponse:
"""
Analyze a single generation based on the prompt and generated text.

Args:
request: Input request containing prompt, generated text and optional metric to analyze

Returns:
GenerationAnalysisResponse: The response for the generation analysis
"""
return await self.evaluate_single_generation(prompt=request.prompt,
generated_text=request.generated_text,
params=request.detector_params)

async def close(self):
"""Close the judge client."""
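
To exercise the new detector methods without going through HTTP, something like the following sketch could work. It assumes LLMJudgeDetector() can be constructed with no arguments and picks up its judge endpoint from environment configuration (not shown in this diff), and that a vLLM-compatible model is reachable.

import asyncio

from detectors.common.scheme import GenerationAnalysisHttpRequest
from detectors.llm_judge.detector import LLMJudgeDetector


async def main() -> None:
    # Assumption: the constructor needs no arguments and configures the judge
    # from environment settings, which this diff does not show.
    detector = LLMJudgeDetector()
    try:
        request = GenerationAnalysisHttpRequest(
            prompt="What do you think about the future of AI?",
            generated_text="The future of AI is bright but we need to be careful about the risks.",
            detector_params={"metric": "safety"},  # omit to fall back to the safety default
        )
        result = await detector.analyze_generation(request)
        print(result.detection, result.score, result.metadata)
    finally:
        await detector.close()


asyncio.run(main())
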
3 changes: 1 addition & 2 deletions detectors/llm_judge/requirements.txt
@@ -1,2 +1 @@
vllm-judge[jinja2]==0.1.6
pyyaml==6.0.2
vllm-judge[jinja2]==0.1.8
74 changes: 0 additions & 74 deletions detectors/llm_judge/scheme.py

This file was deleted.
