Commit 2a878b0

Add '/api/v1/text/generation' FMS Detector API support

1 parent 0a3bd13 · commit 2a878b0

3 files changed: +122 -23 lines changed

detectors/llm_judge/app.py
Lines changed: 23 additions & 3 deletions

```diff
@@ -11,6 +11,8 @@
     ContentsAnalysisResponse,
     MetricsListResponse,
     Error,
+    GenerationAnalysisHttpRequest,
+    GenerationAnalysisResponse,
 )
 
 
@@ -35,21 +37,39 @@ async def lifespan(app: FastAPI):
     "/api/v1/text/contents",
     response_model=ContentsAnalysisResponse,
     description="""LLM-as-Judge detector that evaluates content using various metrics like safety, toxicity, accuracy, helpfulness, etc. \
-The metric parameter allows you to specify which evaluation criteria to use. \
+The metric detector_params parameter allows you to specify which evaluation criteria to use. \
 Supports all built-in vllm_judge metrics including safety, accuracy, helpfulness, clarity, and many more.""",
     responses={
         404: {"model": Error, "description": "Resource Not Found"},
         422: {"model": Error, "description": "Validation Error"},
     },
 )
-async def detector_unary_handler(
+async def detector_content_analysis_handler(
     request: ContentAnalysisHttpRequest,
     detector_id: Annotated[str, Header(example="llm_judge_safety")],
 ):
     """Analyze content using LLM-as-Judge evaluation."""
     detector: LLMJudgeDetector = app.get_detector()
-    return ContentsAnalysisResponse(root=await detector.run(request))
+    return ContentsAnalysisResponse(root=await detector.analyze_content(request))
 
+@app.post(
+    "/api/v1/text/generation",
+    response_model=GenerationAnalysisResponse,
+    description="""Analyze a single generation using the specified metric. \
+The metric detector_params parameter allows you to specify which evaluation criteria to use. \
+Supports all built-in vllm_judge metrics including safety, accuracy, helpfulness, clarity, and many more.""",
+    responses={
+        404: {"model": Error, "description": "Resource Not Found"},
+        422: {"model": Error, "description": "Validation Error"},
+    },
+)
+async def detector_generation_analysis_handler(
+    request: GenerationAnalysisHttpRequest,
+    detector_id: Annotated[str, Header(example="llm_judge_safety")],
+):
+    """Analyze a single generation using LLM-as-Judge evaluation."""
+    detector: LLMJudgeDetector = app.get_detector()
+    return await detector.analyze_generation(request)
 
 @app.get(
     "/api/v1/metrics",
```

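For reference, the new endpoint can be exercised with a plain HTTP client once the detector service is running. This is a minimal sketch, not part of the commit: the host/port and the `detector-id` header spelling (FastAPI's default underscore-to-hyphen conversion of the `detector_id` header parameter) are assumptions; the payload values mirror the schema examples in scheme.py below.

```python
# Minimal sketch of calling the new endpoint; assumes the detector service is
# running locally on port 8000 and that the detector_id header parameter is
# sent as "detector-id" (FastAPI's default underscore-to-hyphen conversion).
import requests

response = requests.post(
    "http://localhost:8000/api/v1/text/generation",
    headers={"detector-id": "llm_judge_safety"},
    json={
        "prompt": "What do you think about the future of AI?",
        "generated_text": "The future of AI is bright but we need to be careful about the risks.",
        "detector_params": {"metric": "safety"},
    },
    timeout=30,
)
response.raise_for_status()
print(response.json())  # e.g. {"detection": "safe", "detection_type": "llm_judge", "score": 0.8, ...}
```
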
detectors/llm_judge/detector.py
Lines changed: 79 additions & 19 deletions

```diff
@@ -8,6 +8,8 @@
     ContentAnalysisHttpRequest,
     ContentAnalysisResponse,
     ContentsAnalysisResponse,
+    GenerationAnalysisHttpRequest,
+    GenerationAnalysisResponse,
 )
 
 
@@ -40,33 +42,48 @@ def _initialize_judge(self) -> None:
             logger.error(f"Failed to detect model: {e}")
             raise
 
-    async def evaluate_single_content(self, content: str, params: Dict[str, Any]) -> ContentAnalysisResponse:
+    def _validate_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Evaluate a single piece of content using the specified metric.
-
-        Args:
-            content: Text content to evaluate
-            params: vLLM Judge parameters for the evaluation
-
-        Returns:
-            ContentAnalysisResponse with evaluation results
+        Make sure the params have valid metric/criteria and scale.
         """
         if "metric" not in params:
             if "criteria" not in params:
                 params["metric"] = "safety"  # Default to safety
             elif "scale" not in params:
                 params["scale"] = (0, 1)  # Default to 0-1 scale
-
-        if "metric" in params:
+        else:
             if params["metric"] not in self.available_metrics:
                 raise MetricNotFoundError(
                     f"Metric '{params['metric']}' not found. Available metrics: {', '.join(sorted(self.available_metrics))}"
                 )
             judge_metric = BUILTIN_METRICS[params["metric"]]
             if judge_metric.scale is None:
                 params["scale"] = (0, 1)  # Default to 0-1 scale
+
+        return params
+
+    def _get_score(self, result: EvaluationResult) -> float:
+        """
+        Get the score from the evaluation result.
+        """
+        if isinstance(result.decision, (int, float)) or result.score is not None:
+            return float(result.score if result.score is not None else result.decision)
+        return 0.0  # FIXME: default to 0 because of non-optional field in schema
+
+    async def evaluate_single_content(self, content: str, params: Dict[str, Any]) -> ContentAnalysisResponse:
+        """
+        Evaluate a single piece of content using the specified metric.
+
+        Args:
+            content: Text content to evaluate
+            params: vLLM Judge parameters for the evaluation
+
+        Returns:
+            ContentAnalysisResponse with evaluation results
+        """
+        params: Dict[str, Any] = self._validate_params(params)
 
-        evaluation_params = {
+        evaluation_params: Dict[str, Any] = {
             "content": content,
             **params
         }
@@ -76,11 +93,8 @@ async def evaluate_single_content(self, content: str, params: Dict[str, Any]) -> ContentAnalysisResponse:
             **evaluation_params
         )
 
-        # Convert to response format
-        score = None
-        if isinstance(result.decision, (int, float)) or result.score is not None:
-            # Numeric result
-            score = float(result.score if result.score is not None else result.decision)
+        # Convert to response format.
+        score: float = self._get_score(result)
 
         return ContentAnalysisResponse(
             start=0,
@@ -93,12 +107,12 @@ async def evaluate_single_content(self, content: str, params: Dict[str, Any]) -> ContentAnalysisResponse:
             metadata={"reasoning": result.reasoning}
         )
 
-    async def run(self, request: ContentAnalysisHttpRequest) -> ContentsAnalysisResponse:
+    async def analyze_content(self, request: ContentAnalysisHttpRequest) -> ContentsAnalysisResponse:
         """
         Run content analysis for each input text.
 
         Args:
-            request: Input request containing texts and metric to analyze
+            request: Input request containing texts and optional metric to analyze
 
         Returns:
             ContentsAnalysisResponse: The aggregated response for all input texts
@@ -111,7 +125,53 @@ async def run(self, request: ContentAnalysisHttpRequest) -> ContentsAnalysisResponse:
             contents_analyses.append([analysis])  # Wrap in list to match schema
 
         return contents_analyses
+
+    async def evaluate_single_generation(self, prompt: str, generated_text: str, params: Dict[str, Any]) -> GenerationAnalysisResponse:
+        """
+        Evaluate a single generation based on the prompt and generated text.
+
+        Args:
+            prompt: Prompt to the LLM
+            generated_text: Generated text from the LLM
+            params: vLLM Judge parameters for the evaluation
+
+        Returns:
+            GenerationAnalysisResponse: The response for the generation analysis
+        """
+        params: Dict[str, Any] = self._validate_params(params)
+        evaluation_params: Dict[str, Any] = {
+            "input": prompt,
+            "content": generated_text,
+            **params
+        }
+
+        result: EvaluationResult = await self.judge.evaluate(
+            **evaluation_params
+        )
+
+        score: float = self._get_score(result)
+
+        return GenerationAnalysisResponse(
+            detection=str(result.decision),
+            detection_type="llm_judge",
+            score=score,
+            evidences=[],
+            metadata={"reasoning": result.reasoning}
+        )
+
+    async def analyze_generation(self, request: GenerationAnalysisHttpRequest) -> GenerationAnalysisResponse:
+        """
+        Analyze a single generation based on the prompt and generated text.
+
+        Args:
+            request: Input request containing prompt, generated text and optional metric to analyze
 
+        Returns:
+            GenerationAnalysisResponse: The response for the generation analysis
+        """
+        return await self.evaluate_single_generation(prompt=request.prompt,
+                                                     generated_text=request.generated_text,
+                                                     params=request.detector_params)
 
     async def close(self):
         """Close the judge client."""
```

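To summarize the defaulting behavior that the new `_validate_params` helper applies, here is a standalone sketch of the same rules. It is illustrative only: `AVAILABLE_METRICS` and `UNSCALED_METRICS` are stand-ins for the detector's `self.available_metrics` and for the `BUILTIN_METRICS` entries whose `scale` is `None`, and it raises `ValueError` where the real method raises `MetricNotFoundError`.

```python
# Standalone sketch of the parameter-defaulting rules; AVAILABLE_METRICS and
# UNSCALED_METRICS are illustrative stand-ins, not the detector's real data.
from typing import Any, Dict

AVAILABLE_METRICS = {"safety", "accuracy", "helpfulness", "clarity"}
UNSCALED_METRICS = {"safety"}  # assumption: built-in metrics defined without a scale


def validate_params(params: Dict[str, Any]) -> Dict[str, Any]:
    if "metric" not in params:
        if "criteria" not in params:
            params["metric"] = "safety"      # nothing specified: default to the safety metric
        elif "scale" not in params:
            params["scale"] = (0, 1)         # custom criteria: default to a 0-1 scale
    else:
        if params["metric"] not in AVAILABLE_METRICS:
            raise ValueError(f"Metric '{params['metric']}' not found")
        if params["metric"] in UNSCALED_METRICS:
            params["scale"] = (0, 1)         # metric has no built-in scale: default to 0-1
    return params


print(validate_params({}))                          # {'metric': 'safety'}
print(validate_params({"criteria": "politeness"}))  # {'criteria': 'politeness', 'scale': (0, 1)}
```
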
detectors/llm_judge/scheme.py
Lines changed: 20 additions & 1 deletion

```diff
@@ -71,4 +71,23 @@ class Error(BaseModel):
 class MetricsListResponse(BaseModel):
     """Response for listing available metrics."""
     metrics: List[str] = Field(description="List of available metric names")
-    total: int = Field(description="Total number of available metrics")
+    total: int = Field(description="Total number of available metrics")
+
+class GenerationAnalysisHttpRequest(BaseModel):
+    prompt: str = Field(description="Prompt is the user input to the LLM", example="What do you think about the future of AI?")
+    generated_text: str = Field(description="Generated response from the LLM", example="The future of AI is bright but we need to be careful about the risks.")
+    detector_params: Optional[Dict[str, Any]] = Field(
+        default_factory=dict,
+        description="Detector parameters for evaluation (e.g., metric, criteria, etc.)",
+        example={"metric": "safety"}
+    )
+
+class GenerationAnalysisResponse(BaseModel):
+    detection: str = Field(example="safe")
+    detection_type: str = Field(example="llm_judge")
+    score: float = Field(example=0.8)
+    evidences: Optional[List[EvidenceObj]] = Field(
+        description="Optional field providing evidences for the provided detection",
+        default=[],
+    )
+    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Additional metadata from evaluation")
```
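
As a usage note, the new Pydantic models can be instantiated directly; this is a sketch in which the import path and the placeholder `reasoning` string are assumptions, while the field values mirror the `example` arguments in the schema above.

```python
# Illustrative construction of the new request/response models; the import
# path and the reasoning string are assumptions, the field values mirror the
# schema examples above.
from scheme import GenerationAnalysisHttpRequest, GenerationAnalysisResponse

request = GenerationAnalysisHttpRequest(
    prompt="What do you think about the future of AI?",
    generated_text="The future of AI is bright but we need to be careful about the risks.",
    detector_params={"metric": "safety"},
)

response = GenerationAnalysisResponse(
    detection="safe",
    detection_type="llm_judge",
    score=0.8,
    metadata={"reasoning": "..."},
)

print(request.detector_params)  # {'metric': 'safety'}
print(response.score)           # 0.8
```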
