Commit ff88f67

Merge pull request #13 from saichandrapandraju/vllm-judge
RHOAIENG-23163: Enable LLM-as-a-judge detections from vLLM hosted models
2 parents e41a429 + 3ca565f commit ff88f67

File tree

14 files changed: +753 −1 lines changed

detectors/Dockerfile.judge

Lines changed: 30 additions & 0 deletions

```dockerfile
FROM registry.access.redhat.com/ubi9/ubi-minimal as base
RUN microdnf update -y && \
    microdnf install -y --nodocs \
    python-pip python-devel && \
    pip install --upgrade --no-cache-dir pip wheel && \
    microdnf clean all

FROM base as builder

COPY ./common/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY ./llm_judge/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

FROM builder

WORKDIR /app
ARG CACHEBUST=1
RUN echo "$CACHEBUST"
COPY ./common /app/detectors/common
COPY ./llm_judge/app.py /app/detectors/llm_judge/app.py
COPY ./llm_judge/detector.py /app/detectors/llm_judge/detector.py
COPY ./llm_judge/scheme.py /app/detectors/llm_judge/scheme.py
RUN touch /app/detectors/llm_judge/__init__.py

EXPOSE 8000
CMD ["uvicorn", "detectors.llm_judge.app:app", "--workers", "4", "--host", "0.0.0.0", "--port", "8000", "--log-config", "/app/detectors/common/log_conf.yaml"]

# gunicorn main:app --workers 4 --worker-class uvicorn.workers.UvicornWorker --bind 0.0.0.0:8000
```

detectors/__init__.py

Whitespace-only changes.

detectors/llm_judge/README.md

Lines changed: 8 additions & 0 deletions

# LLM Judge Detector

The LLM Judge detector integrates the [vLLM Judge](https://github.com/saichandrapandraju/vllm_judge) into the Guardrails Detector ecosystem.

```
oc apply -f deploy/servingruntime.yaml
oc apply -f deploy/isvc.yaml
```
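
Once both resources are applied, the detector serves the standard Guardrails detections endpoint. A minimal smoke test is sketched below; the `localhost` URL assumes a port-forward to the service, and the `detector-id` header value is illustrative:

```python
# Minimal smoke test for a deployed LLM Judge detector.
# Assumes the service is reachable at http://localhost:8000 (e.g. via port-forward).
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/text/contents",
    headers={"detector-id": "llm_judge_safety"},  # illustrative detector id
    json={
        "contents": ["Martians are like crocodiles; the more you give them meat, the more they want"],
        "detector_params": {"metric": "safety"},
    },
    timeout=60,
)
resp.raise_for_status()
# Response is a list of analyses per input text, in the order provided.
print(resp.json())
```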

detectors/llm_judge/__init__.py

Whitespace-only changes.

detectors/llm_judge/app.py

Lines changed: 70 additions & 0 deletions

```python
from contextlib import asynccontextmanager
from typing import Annotated, Dict

from fastapi import Header
from prometheus_fastapi_instrumentator import Instrumentator

from detectors.common.app import DetectorBaseAPI as FastAPI
from detectors.llm_judge.detector import LLMJudgeDetector
from detectors.llm_judge.scheme import (
    ContentAnalysisHttpRequest,
    ContentsAnalysisResponse,
    MetricsListResponse,
    Error,
)

detector_objects: Dict[str, LLMJudgeDetector] = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan management."""
    try:
        detector_objects["detector"] = LLMJudgeDetector()
        yield
    finally:
        # Clean up resources
        if "detector" in detector_objects:
            await detector_objects["detector"].close()
        detector_objects.clear()


app = FastAPI(lifespan=lifespan, dependencies=[])
Instrumentator().instrument(app).expose(app)


@app.post(
    "/api/v1/text/contents",
    response_model=ContentsAnalysisResponse,
    description="""LLM-as-Judge detector that evaluates content using various metrics like safety, toxicity, accuracy, helpfulness, etc. \
The metric parameter allows you to specify which evaluation criteria to use. \
Supports all built-in vllm_judge metrics including safety, accuracy, helpfulness, clarity, and many more.""",
    responses={
        404: {"model": Error, "description": "Resource Not Found"},
        422: {"model": Error, "description": "Validation Error"},
    },
)
async def detector_unary_handler(
    request: ContentAnalysisHttpRequest,
    detector_id: Annotated[str, Header(example="llm_judge_safety")],
):
    """Analyze content using LLM-as-Judge evaluation."""
    return ContentsAnalysisResponse(root=await detector_objects["detector"].run(request))


@app.get(
    "/api/v1/metrics",
    response_model=MetricsListResponse,
    description="List all available metrics for LLM Judge evaluation",
    responses={
        404: {"model": Error, "description": "Resource Not Found"},
    },
)
async def list_metrics():
    """List all available evaluation metrics."""
    detector = detector_objects.get("detector")
    if not detector:
        return {"metrics": [], "total": 0}

    metrics = detector.list_available_metrics()
    return MetricsListResponse(metrics=metrics, total=len(metrics))
```
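
The `/api/v1/metrics` route makes the detector self-describing. A quick sketch of listing the available metrics from a running instance (the URL again assumes a local port-forward):

```python
import requests

# Query the metrics discovery endpoint of a running detector (illustrative URL).
resp = requests.get("http://localhost:8000/api/v1/metrics", timeout=30)
resp.raise_for_status()
data = resp.json()
print(f"{data['total']} metrics available")
print(", ".join(data["metrics"]))
```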

detectors/llm_judge/deploy/isvc.yaml

Lines changed: 23 additions & 0 deletions

```yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: guardrails-detector-llm-judge
  namespace: model-namespace
  labels:
    opendatahub.io/dashboard: 'true'
  annotations:
    openshift.io/display-name: guardrails-detector-llm-judge
    security.opendatahub.io/enable-auth: 'true'
    serving.knative.openshift.io/enablePassthrough: 'true'
    sidecar.istio.io/inject: 'true'
    sidecar.istio.io/rewriteAppHTTPProbers: 'true'
    serving.kserve.io/deploymentMode: RawDeployment
spec:
  predictor:
    maxReplicas: 1
    minReplicas: 1
    model:
      modelFormat:
        name: guardrails-detector-llm-judge
      name: ''
      runtime: guardrails-detector-runtime-judge
```

detectors/llm_judge/deploy/servingruntime.yaml

Lines changed: 45 additions & 0 deletions

```yaml
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: guardrails-detector-runtime-judge
  namespace: model-namespace
  annotations:
    openshift.io/display-name: Guardrails LLM Judge Detector ServingRuntime for KServe
  labels:
    opendatahub.io/dashboard: 'true'
spec:
  annotations:
    prometheus.io/port: '8080'
    prometheus.io/path: '/metrics'
  multiModel: false
  supportedModelFormats:
    - autoSelect: true
      name: guardrails-detector-llm-judge
  containers:
    - name: kserve-container
      image: quay.io/spandraj/guardrails-detector-judge:latest
      command:
        - uvicorn
        - detectors.llm_judge.app:app
      args:
        - "--workers"
        - "1"
        - "--host"
        - "0.0.0.0"
        - "--port"
        - "8000"
        - "--log-config"
        - "/app/detectors/common/log_conf.yaml"
      env:
        - name: VLLM_BASE_URL
          value: "http://qwen2-predictor:8080" # <-- Change this to your vLLM URL
      ports:
        - containerPort: 8000
          protocol: TCP
      resources:
        requests:
          memory: "5Gi"
          cpu: "1"
        limits:
          memory: "10Gi"
          cpu: "2"
```

detectors/llm_judge/detector.py

Lines changed: 123 additions & 0 deletions

```python
import os
from typing import List, Dict, Any

from vllm_judge import Judge, EvaluationResult, BUILTIN_METRICS
from vllm_judge.exceptions import MetricNotFoundError
from detectors.common.app import logger
from detectors.llm_judge.scheme import (
    ContentAnalysisHttpRequest,
    ContentAnalysisResponse,
    ContentsAnalysisResponse,
)


class LLMJudgeDetector:
    """LLM-as-Judge detector for evaluating content using vllm_judge."""

    def __init__(self) -> None:
        """Initialize the LLM Judge Detector."""
        self.judge = None
        self.available_metrics = set(BUILTIN_METRICS.keys())

        # Get configuration from environment
        self.vllm_base_url = os.environ.get("VLLM_BASE_URL")

        if not self.vllm_base_url:
            raise ValueError("VLLM_BASE_URL environment variable is required")

        logger.info(f"Initializing LLM Judge with URL: {self.vllm_base_url}")

        self._initialize_judge()

    def _initialize_judge(self) -> None:
        """Initialize the vLLM Judge."""
        try:
            self.judge = Judge.from_url(base_url=self.vllm_base_url)
            logger.info(f"LLM Judge initialized successfully with model: {self.judge.config.model} and base url: {self.judge.config.base_url}")
            logger.info(f"Available metrics: {', '.join(sorted(self.available_metrics))}")
        except Exception as e:
            logger.error(f"Failed to detect model: {e}")
            raise

    async def evaluate_single_content(self, content: str, params: Dict[str, Any]) -> ContentAnalysisResponse:
        """
        Evaluate a single piece of content using the specified metric.

        Args:
            content: Text content to evaluate
            params: vLLM Judge parameters for the evaluation

        Returns:
            ContentAnalysisResponse with evaluation results
        """
        if "metric" not in params:
            if "criteria" not in params:
                params["metric"] = "safety"  # Default to safety
            elif "scale" not in params:
                params["scale"] = (0, 1)  # Default to 0-1 scale

        if "metric" in params:
            if params["metric"] not in self.available_metrics:
                raise MetricNotFoundError(
                    f"Metric '{params['metric']}' not found. Available metrics: {', '.join(sorted(self.available_metrics))}"
                )
            judge_metric = BUILTIN_METRICS[params["metric"]]
            if judge_metric.scale is None:
                params["scale"] = (0, 1)  # Default to 0-1 scale

        evaluation_params = {
            "content": content,
            **params
        }

        # Perform evaluation
        result: EvaluationResult = await self.judge.evaluate(
            **evaluation_params
        )

        # Convert to response format
        score = None
        if isinstance(result.decision, (int, float)) or result.score is not None:
            # Numeric result
            score = float(result.score if result.score is not None else result.decision)

        return ContentAnalysisResponse(
            start=0,
            end=len(content),
            detection=str(result.decision),
            detection_type="llm_judge",
            score=score,
            text=content,
            evidences=[],
            metadata={"reasoning": result.reasoning}
        )

    async def run(self, request: ContentAnalysisHttpRequest) -> ContentsAnalysisResponse:
        """
        Run content analysis for each input text.

        Args:
            request: Input request containing texts and metric to analyze

        Returns:
            ContentsAnalysisResponse: The aggregated response for all input texts
        """
        contents_analyses = []

        for content in request.contents:
            analysis = await self.evaluate_single_content(content, request.detector_params)
            contents_analyses.append([analysis])  # Wrap in list to match schema

        return contents_analyses

    async def close(self):
        """Close the judge client."""
        if self.judge:
            await self.judge.close()

    def list_available_metrics(self) -> List[str]:
        """Return list of available metrics."""
        return sorted(list(self.available_metrics))
```
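
For experimentation outside the FastAPI app, `LLMJudgeDetector` can also be driven directly. A minimal sketch, assuming a vLLM server is reachable at the address below (illustrative):

```python
import asyncio
import os

# Must be set before the detector is constructed (illustrative address).
os.environ["VLLM_BASE_URL"] = "http://localhost:8080"

from detectors.llm_judge.detector import LLMJudgeDetector


async def main():
    detector = LLMJudgeDetector()
    try:
        analysis = await detector.evaluate_single_content(
            "Martians are like crocodiles; the more you give them meat, the more they want",
            {"metric": "safety"},
        )
        print(analysis.detection, analysis.score, analysis.metadata["reasoning"])
    finally:
        await detector.close()


asyncio.run(main())
```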

detectors/llm_judge/requirements.txt

Lines changed: 2 additions & 0 deletions

```
vllm-judge==0.1.6
pyyaml==6.0.2
```

detectors/llm_judge/scheme.py

Lines changed: 74 additions & 0 deletions

```python
from enum import Enum
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field, RootModel


class Evidence(BaseModel):
    source: str = Field(
        title="Source",
        example="https://en.wikipedia.org/wiki/IBM",
        description="Source of the evidence; it can be a URL of the evidence, etc.",
    )


class EvidenceType(str, Enum):
    url = "url"
    title = "title"


class EvidenceObj(BaseModel):
    type: EvidenceType = Field(
        title="EvidenceType",
        example="url",
        description="Type of the evidence provided, e.g. url, title, etc.",
    )
    evidence: Evidence = Field(
        description="Evidence object; currently only contains source, but may gain other optional fields such as id in the future.",
    )


class ContentAnalysisHttpRequest(BaseModel):
    contents: List[str] = Field(
        min_length=1,
        title="Contents",
        description="List of texts to analyze. The response contains the analysis/detection for each text, in the order provided.",
        example=[
            "Martians are like crocodiles; the more you give them meat, the more they want"
        ],
    )
    detector_params: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="Detector parameters for evaluation (e.g., metric, criteria, etc.)",
        example={"metric": "safety"}
    )


class ContentAnalysisResponse(BaseModel):
    start: int = Field(example=0)
    end: int = Field(example=75)
    text: str = Field(example="This is a safe and helpful response")
    detection: str = Field(example="vllm_model")
    detection_type: str = Field(example="llm_judge")
    score: float = Field(example=0.8)
    evidences: Optional[List[EvidenceObj]] = Field(
        description="Optional field providing evidences for the provided detection",
        default=[],
    )
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Additional metadata from evaluation")


class ContentsAnalysisResponse(RootModel):
    root: List[List[ContentAnalysisResponse]] = Field(
        title="Response Text Content Analysis LLM Judge"
    )


class Error(BaseModel):
    code: int
    message: str


class MetricsListResponse(BaseModel):
    """Response for listing available metrics."""
    metrics: List[str] = Field(description="List of available metric names")
    total: int = Field(description="Total number of available metrics")
```
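
The nested `List[List[ContentAnalysisResponse]]` root is easy to misread: the outer list has one entry per input text, each holding that text's list of analyses. A short sketch building the models by hand (the decision string and score are illustrative):

```python
from detectors.llm_judge.scheme import (
    ContentAnalysisHttpRequest,
    ContentAnalysisResponse,
    ContentsAnalysisResponse,
)

request = ContentAnalysisHttpRequest(
    contents=["This is a safe and helpful response"],
    detector_params={"metric": "safety"},
)

analysis = ContentAnalysisResponse(
    start=0,
    end=len(request.contents[0]),
    text=request.contents[0],
    detection="PASS",  # illustrative decision string
    detection_type="llm_judge",
    score=0.95,  # illustrative score
    metadata={"reasoning": "No harmful content found."},
)

# One inner list per input text, mirroring the order of `contents`.
response = ContentsAnalysisResponse(root=[[analysis]])
print(response.model_dump_json(indent=2))
```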
