Commit ab5a358

add tests for generation analysis
1 parent 315a437 commit ab5a358

File tree: 2 files changed (+362, -12 lines)

tests/detectors/llm_judge/test_llm_judge_detector.py

Lines changed: 306 additions & 5 deletions
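
Note that this page shows the diff for the test file only; the generation-analysis schema it imports from detectors.llm_judge.scheme (presumably the second changed file) is not part of this diff. As a rough orientation, here is a minimal, hypothetical sketch of the request/response shapes the new tests construct and assert against; the field names are taken from the tests below, but the concrete base classes are an assumption (the real module may use pydantic models with validation).

# Hypothetical sketch of the generation-analysis schema assumed by the tests.
# Field names come from the test assertions; the real classes in
# detectors.llm_judge.scheme may differ (e.g. pydantic models, extra fields).
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class GenerationAnalysisHttpRequest:
    prompt: str                # the user prompt that produced the generation
    generated_text: str        # the model output to be judged
    detector_params: Dict[str, Any] = field(default_factory=dict)  # passed through to vllm_judge


@dataclass
class GenerationAnalysisResponse:
    detection: str            # judge decision, e.g. "HELPFUL", or a stringified number like "8.5"
    detection_type: str       # always "llm_judge" in these tests
    score: float              # judge score; the tests expect 0.0 when the judge returns None
    metadata: Dict[str, Any]  # includes the judge's "reasoning"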
@@ -9,15 +9,17 @@
 from detectors.llm_judge.scheme import (
     ContentAnalysisHttpRequest,
     ContentAnalysisResponse,
+    GenerationAnalysisHttpRequest,
+    GenerationAnalysisResponse,
 )

 # Import vLLM Judge components for mocking
 from vllm_judge import EvaluationResult
 from vllm_judge.exceptions import MetricNotFoundError


-class TestLLMJudgeDetector:
-    """Test suite for LLMJudgeDetector."""
+class TestLLMJudgeDetectorContentAnalysis:
+    """Test suite for LLMJudgeDetector content analysis."""

     @pytest.fixture
     def mock_judge_result(self) -> EvaluationResult:
@@ -73,6 +75,16 @@ def test_detector_initialization_unreachable_url(self) -> None:
         with pytest.raises(Exception, match="Failed to detect model"):
             LLMJudgeDetector()

+    def test_close_detector(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test closing the detector properly closes the judge."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        asyncio.run(detector.close())
+
+        mock_judge.close.assert_called_once()
+
     def test_evaluate_single_content_basic_metric(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
         """Test basic evaluation with just a metric."""
         detector: LLMJudgeDetector
@@ -182,7 +194,7 @@ def test_run_single_content(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
             detector_params={"metric": "safety"}
         )

-        result = asyncio.run(detector.run(request))
+        result = asyncio.run(detector.analyze_content(request))

         assert len(result) == 1
         assert len(result[0]) == 1
@@ -216,7 +228,7 @@ def test_run_multiple_contents(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
             detector_params={"metric": "safety"}
         )

-        result = asyncio.run(detector.run(request))
+        result = asyncio.run(detector.analyze_content(request))

         assert len(result) == 3
         for i, analysis_list in enumerate(result):
@@ -252,7 +264,7 @@ def test_run_with_custom_evaluation_params(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
             detector_params=custom_evaluation_params
         )

-        result = asyncio.run(detector.run(request))
+        result = asyncio.run(detector.analyze_content(request))

         # Verify complex parameters were passed correctly
         expected_call_params = {
@@ -285,3 +297,292 @@ def test_close_detector(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
         asyncio.run(detector.close())

         mock_judge.close.assert_called_once()
+
+
+class TestLLMJudgeDetectorGenerationAnalysis:
+    """Test suite for LLMJudgeDetector generation analysis."""
+
+    @pytest.fixture
+    def mock_judge_result(self) -> EvaluationResult:
+        """Mock EvaluationResult for generation testing."""
+        return EvaluationResult(
+            decision="HELPFUL",
+            reasoning="This generated response is helpful and addresses the user's question appropriately.",
+            score=0.85,
+            metadata={"model": "test-model"}
+        )
+
+    @pytest.fixture
+    def detector_with_mock_judge(self, mock_judge_result) -> Tuple[LLMJudgeDetector, AsyncMock]:
+        """Create detector with mocked Judge."""
+        with patch.dict(os.environ, {"VLLM_BASE_URL": "http://test:8000"}):
+            with patch('vllm_judge.Judge.from_url') as mock_judge_class:
+                # Create mock judge instance
+                mock_judge_instance = AsyncMock()
+                mock_judge_instance.evaluate = AsyncMock(return_value=mock_judge_result)
+                mock_judge_instance.config.model = "test-model"
+                mock_judge_instance.config.base_url = "http://test:8000"
+                mock_judge_instance.close = AsyncMock()
+
+                mock_judge_class.return_value = mock_judge_instance
+
+                detector = LLMJudgeDetector()
+                return detector, mock_judge_instance
+
+    def test_evaluate_single_generation_basic_metric(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test basic generation evaluation with just a metric."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        prompt = "What is artificial intelligence?"
+        generated_text = "Artificial intelligence (AI) refers to the simulation of human intelligence in machines."
+        params = {"metric": "helpfulness"}
+
+        result = asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+        # Verify judge.evaluate was called correctly
+        mock_judge.evaluate.assert_called_once_with(
+            input=prompt,
+            content=generated_text,
+            metric="helpfulness"
+        )
+
+        # Verify response format
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection == "HELPFUL"
+        assert result.score == 0.85
+        assert result.detection_type == "llm_judge"
+        assert "reasoning" in result.metadata
+
+    def test_evaluate_single_generation_full_parameters(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation evaluation with all vLLM Judge parameters."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        prompt = "Explain quantum computing in simple terms"
+        generated_text = "Quantum computing uses quantum bits (qubits) that can exist in multiple states simultaneously, allowing for parallel processing of information."
+        params = {
+            "criteria": "accuracy, clarity, and completeness",
+            "rubric": "Score based on technical accuracy and accessibility",
+            "scale": [1, 10],
+            "examples": [{"input": "test prompt", "output": "example response"}],
+            "system_prompt": "You are evaluating educational content",
+            "context": "This is for a general audience explanation of {topic}",
+            "template_vars": {"topic": "quantum computing"}
+        }
+
+        asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+        # Verify all parameters were passed through
+        expected_call = {
+            "input": prompt,
+            "content": generated_text,
+            **params
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_call)
+
+    def test_evaluate_single_generation_criteria_without_metric(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation evaluation with criteria but no metric (should default scale)."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        prompt = "Write a short story"
+        generated_text = "Once upon a time, there was a brave knight who saved a village from a dragon."
+        params = {
+            "criteria": "creativity and engagement",
+            "rubric": "Custom rubric for story evaluation"
+        }
+
+        asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+        # Should add default scale when criteria provided without metric
+        expected_params = {
+            "input": prompt,
+            "content": generated_text,
+            "criteria": "creativity and engagement",
+            "rubric": "Custom rubric for story evaluation",
+            "scale": (0, 1)
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_params)
+
+    def test_evaluate_single_generation_no_params(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation evaluation with no parameters (should default to safety)."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        prompt = "Tell me about AI"
+        generated_text = "AI is a field of computer science focused on creating intelligent machines."
+        params = {}
+
+        asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+        # Should default to safety metric
+        expected_params = {
+            "input": prompt,
+            "content": generated_text,
+            "metric": "safety"
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_params)
+
+    def test_evaluate_single_generation_invalid_metric(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation evaluation with invalid metric raises error."""
+        detector: LLMJudgeDetector
+        detector, _ = detector_with_mock_judge
+
+        prompt = "Test prompt"
+        generated_text = "Test generation"
+        params = {"metric": "invalid_metric"}
+
+        with pytest.raises(MetricNotFoundError, match="Metric 'invalid_metric' not found"):
+            asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+    def test_analyze_generation_basic_request(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test the analyze_generation method with basic request."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="What is machine learning?",
+            generated_text="Machine learning is a subset of AI that enables computers to learn from data without explicit programming.",
+            detector_params={"metric": "accuracy"}
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        # Verify judge.evaluate was called correctly
+        mock_judge.evaluate.assert_called_once_with(
+            input="What is machine learning?",
+            content="Machine learning is a subset of AI that enables computers to learn from data without explicit programming.",
+            metric="accuracy"
+        )
+
+        # Verify response format
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection == "HELPFUL"
+        assert result.score == 0.85
+        assert result.detection_type == "llm_judge"
+        assert "reasoning" in result.metadata
+        assert result.metadata["reasoning"] is not None
+
+    def test_analyze_generation_complex_request(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test the analyze_generation method with complex parameters."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="Explain the benefits and risks of artificial intelligence",
+            generated_text="AI offers significant benefits like improved efficiency and automation, but also poses risks such as job displacement and potential bias in decision-making systems.",
+            detector_params={
+                "criteria": "balance, accuracy, and completeness",
+                "rubric": {
+                    1.0: "Excellent balance of benefits and risks with high accuracy",
+                    0.8: "Good coverage with minor gaps",
+                    0.6: "Adequate but missing some key points",
+                    0.4: "Poor coverage or significant inaccuracies",
+                    0.0: "Completely inadequate or misleading"
+                },
+                "scale": [0, 1],
+                "context": "This is for an educational discussion about AI ethics"
+            }
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        # Verify complex parameters were passed correctly
+        expected_call_params = {
+            "input": request.prompt,
+            "content": request.generated_text,
+            **request.detector_params
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_call_params)
+
+        # Verify response
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection_type == "llm_judge"
+
+    def test_analyze_generation_empty_params(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test analyze_generation with empty detector params (should default to safety)."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="Hello, how are you?",
+            generated_text="I'm doing well, thank you for asking! How can I assist you today?",
+            detector_params={}
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        # Should default to safety metric
+        expected_params = {
+            "input": request.prompt,
+            "content": request.generated_text,
+            "metric": "safety"
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_params)
+
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection_type == "llm_judge"
+
+    def test_generation_analysis_with_numeric_score(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation analysis handles numeric scores correctly."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        # Mock a numeric decision result
+        numeric_result = EvaluationResult(
+            decision=8.5,
+            reasoning="High quality response with good accuracy",
+            score=8.5,
+            metadata={"model": "test-model"}
+        )
+        mock_judge.evaluate.return_value = numeric_result
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="Explain photosynthesis",
+            generated_text="Photosynthesis is the process by which plants convert light energy into chemical energy.",
+            detector_params={"metric": "accuracy", "scale": [0, 10]}
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection == "8.5"
+        assert result.score == 8.5
+        assert result.detection_type == "llm_judge"
+
+    def test_generation_analysis_with_none_score(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation analysis handles None scores correctly."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        # Mock a result with None score
+        none_score_result = EvaluationResult(
+            decision="GOOD",
+            reasoning="Good quality response",
+            score=None,
+            metadata={"model": "test-model"}
+        )
+        mock_judge.evaluate.return_value = none_score_result
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="Test prompt",
+            generated_text="Test generation",
+            detector_params={"metric": "helpfulness"}
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection == "GOOD"
+        assert result.score == 0.0  # Should default to 0.0 when score is None
+        assert result.detection_type == "llm_judge"
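
For context on what these tests imply about the detector itself (which is modified in the second changed file of this commit, not shown in this diff), the following is a minimal sketch of evaluate_single_generation and analyze_generation that would satisfy the mocked calls and assertions above. It is inferred from the tests rather than taken from the real source: the self.judge attribute, the placeholder class name, and the omission of metric-name validation are all assumptions.

# Hypothetical sketch, inferred from the mocked judge calls and assertions in
# the tests above; the real LLMJudgeDetector implementation may differ.
from typing import Any, Dict

from vllm_judge import Judge

from detectors.llm_judge.scheme import (
    GenerationAnalysisHttpRequest,
    GenerationAnalysisResponse,
)


class _SketchedLLMJudgeDetector:  # illustrative stand-in for LLMJudgeDetector
    def __init__(self, judge: Judge) -> None:
        self.judge = judge

    async def evaluate_single_generation(
        self, prompt: str, generated_text: str, params: Dict[str, Any]
    ) -> GenerationAnalysisResponse:
        evaluation_params = dict(params or {})
        if "metric" not in evaluation_params and "criteria" not in evaluation_params:
            # No metric and no criteria: fall back to the "safety" metric
            # (see test_evaluate_single_generation_no_params).
            evaluation_params["metric"] = "safety"
        elif "criteria" in evaluation_params and "metric" not in evaluation_params:
            # Criteria without a metric: default to a 0-1 scale
            # (see test_evaluate_single_generation_criteria_without_metric).
            evaluation_params.setdefault("scale", (0, 1))
        # NOTE: the real detector must also reject unknown metric names with
        # MetricNotFoundError (see test_evaluate_single_generation_invalid_metric);
        # that lookup is omitted from this sketch.
        result = await self.judge.evaluate(
            input=prompt,
            content=generated_text,
            **evaluation_params,
        )
        return GenerationAnalysisResponse(
            detection=str(result.decision),
            detection_type="llm_judge",
            score=result.score if result.score is not None else 0.0,
            metadata={"reasoning": result.reasoning},
        )

    async def analyze_generation(
        self, request: GenerationAnalysisHttpRequest
    ) -> GenerationAnalysisResponse:
        # analyze_generation simply unpacks the HTTP request and delegates.
        return await self.evaluate_single_generation(
            request.prompt, request.generated_text, request.detector_params
        )

Because every call to the judge is patched with unittest.mock in the fixtures, the whole suite runs without a live vLLM server.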
