from detectors.llm_judge.scheme import (
    ContentAnalysisHttpRequest,
    ContentAnalysisResponse,
+    GenerationAnalysisHttpRequest,
+    GenerationAnalysisResponse,
)

# Import vLLM Judge components for mocking
from vllm_judge import EvaluationResult
from vllm_judge.exceptions import MetricNotFoundError


-class TestLLMJudgeDetector:
-    """Test suite for LLMJudgeDetector."""
+class TestLLMJudgeDetectorContentAnalysis:
+    """Test suite for LLMJudgeDetector content analysis."""

    @pytest.fixture
    def mock_judge_result(self) -> EvaluationResult:
@@ -73,6 +75,16 @@ def test_detector_initialization_unreachable_url(self) -> None:
        with pytest.raises(Exception, match="Failed to detect model"):
            LLMJudgeDetector()

+    def test_close_detector(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test closing the detector properly closes the judge."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        asyncio.run(detector.close())
+
+        mock_judge.close.assert_called_once()
+
    def test_evaluate_single_content_basic_metric(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
        """Test basic evaluation with just a metric."""
        detector: LLMJudgeDetector
@@ -182,7 +194,7 @@ def test_run_single_content(self, detector_with_mock_judge: Tuple[LLMJudgeDetect
            detector_params={"metric": "safety"}
        )

-        result = asyncio.run(detector.run(request))
+        result = asyncio.run(detector.analyze_content(request))

        assert len(result) == 1
        assert len(result[0]) == 1
@@ -216,7 +228,7 @@ def test_run_multiple_contents(self, detector_with_mock_judge: Tuple[LLMJudgeDet
            detector_params={"metric": "safety"}
        )

-        result = asyncio.run(detector.run(request))
+        result = asyncio.run(detector.analyze_content(request))

        assert len(result) == 3
        for i, analysis_list in enumerate(result):
@@ -252,7 +264,7 @@ def test_run_with_custom_evaluation_params(self, detector_with_mock_judge: Tuple
            detector_params=custom_evaluation_params
        )

-        result = asyncio.run(detector.run(request))
+        result = asyncio.run(detector.analyze_content(request))

        # Verify complex parameters were passed correctly
        expected_call_params = {
@@ -285,3 +297,292 @@ def test_close_detector(self, detector_with_mock_judge: Tuple[LLMJudgeDetector,
        asyncio.run(detector.close())

        mock_judge.close.assert_called_once()
+
+
+class TestLLMJudgeDetectorGenerationAnalysis:
+    """Test suite for LLMJudgeDetector generation analysis."""
+
+    @pytest.fixture
+    def mock_judge_result(self) -> EvaluationResult:
+        """Mock EvaluationResult for generation testing."""
+        return EvaluationResult(
+            decision="HELPFUL",
+            reasoning="This generated response is helpful and addresses the user's question appropriately.",
+            score=0.85,
+            metadata={"model": "test-model"}
+        )
+
+    @pytest.fixture
+    def detector_with_mock_judge(self, mock_judge_result) -> Tuple[LLMJudgeDetector, AsyncMock]:
+        """Create detector with mocked Judge."""
+        with patch.dict(os.environ, {"VLLM_BASE_URL": "http://test:8000"}):
+            with patch('vllm_judge.Judge.from_url') as mock_judge_class:
+                # Create mock judge instance
+                mock_judge_instance = AsyncMock()
+                mock_judge_instance.evaluate = AsyncMock(return_value=mock_judge_result)
+                mock_judge_instance.config.model = "test-model"
+                mock_judge_instance.config.base_url = "http://test:8000"
+                mock_judge_instance.close = AsyncMock()
+
+                mock_judge_class.return_value = mock_judge_instance
+
+                detector = LLMJudgeDetector()
+                return detector, mock_judge_instance
+
+    def test_evaluate_single_generation_basic_metric(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test basic generation evaluation with just a metric."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        prompt = "What is artificial intelligence?"
+        generated_text = "Artificial intelligence (AI) refers to the simulation of human intelligence in machines."
+        params = {"metric": "helpfulness"}
+
+        result = asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+        # Verify judge.evaluate was called correctly
+        mock_judge.evaluate.assert_called_once_with(
+            input=prompt,
+            content=generated_text,
+            metric="helpfulness"
+        )
+
+        # Verify response format
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection == "HELPFUL"
+        assert result.score == 0.85
+        assert result.detection_type == "llm_judge"
+        assert "reasoning" in result.metadata
+
+    def test_evaluate_single_generation_full_parameters(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation evaluation with all vLLM Judge parameters."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        prompt = "Explain quantum computing in simple terms"
+        generated_text = "Quantum computing uses quantum bits (qubits) that can exist in multiple states simultaneously, allowing for parallel processing of information."
+        params = {
+            "criteria": "accuracy, clarity, and completeness",
+            "rubric": "Score based on technical accuracy and accessibility",
+            "scale": [1, 10],
+            "examples": [{"input": "test prompt", "output": "example response"}],
+            "system_prompt": "You are evaluating educational content",
+            "context": "This is for a general audience explanation of {topic}",
+            "template_vars": {"topic": "quantum computing"}
+        }
+
+        asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+        # Verify all parameters were passed through
+        expected_call = {
+            "input": prompt,
+            "content": generated_text,
+            **params
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_call)
+
+    def test_evaluate_single_generation_criteria_without_metric(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation evaluation with criteria but no metric (should default scale)."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        prompt = "Write a short story"
+        generated_text = "Once upon a time, there was a brave knight who saved a village from a dragon."
+        params = {
+            "criteria": "creativity and engagement",
+            "rubric": "Custom rubric for story evaluation"
+        }
+
+        asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+        # Should add default scale when criteria provided without metric
+        expected_params = {
+            "input": prompt,
+            "content": generated_text,
+            "criteria": "creativity and engagement",
+            "rubric": "Custom rubric for story evaluation",
+            "scale": (0, 1)
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_params)
+
+    def test_evaluate_single_generation_no_params(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation evaluation with no parameters (should default to safety)."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        prompt = "Tell me about AI"
+        generated_text = "AI is a field of computer science focused on creating intelligent machines."
+        params = {}
+
+        asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+        # Should default to safety metric
+        expected_params = {
+            "input": prompt,
+            "content": generated_text,
+            "metric": "safety"
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_params)
+
+    def test_evaluate_single_generation_invalid_metric(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation evaluation with invalid metric raises error."""
+        detector: LLMJudgeDetector
+        detector, _ = detector_with_mock_judge
+
+        prompt = "Test prompt"
+        generated_text = "Test generation"
+        params = {"metric": "invalid_metric"}
+
+        with pytest.raises(MetricNotFoundError, match="Metric 'invalid_metric' not found"):
+            asyncio.run(detector.evaluate_single_generation(prompt, generated_text, params))
+
+    def test_analyze_generation_basic_request(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test the analyze_generation method with basic request."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="What is machine learning?",
+            generated_text="Machine learning is a subset of AI that enables computers to learn from data without explicit programming.",
+            detector_params={"metric": "accuracy"}
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        # Verify judge.evaluate was called correctly
+        mock_judge.evaluate.assert_called_once_with(
+            input="What is machine learning?",
+            content="Machine learning is a subset of AI that enables computers to learn from data without explicit programming.",
+            metric="accuracy"
+        )
+
+        # Verify response format
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection == "HELPFUL"
+        assert result.score == 0.85
+        assert result.detection_type == "llm_judge"
+        assert "reasoning" in result.metadata
+        assert result.metadata["reasoning"] is not None
+
+    def test_analyze_generation_complex_request(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test the analyze_generation method with complex parameters."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="Explain the benefits and risks of artificial intelligence",
+            generated_text="AI offers significant benefits like improved efficiency and automation, but also poses risks such as job displacement and potential bias in decision-making systems.",
+            detector_params={
+                "criteria": "balance, accuracy, and completeness",
+                "rubric": {
+                    1.0: "Excellent balance of benefits and risks with high accuracy",
+                    0.8: "Good coverage with minor gaps",
+                    0.6: "Adequate but missing some key points",
+                    0.4: "Poor coverage or significant inaccuracies",
+                    0.0: "Completely inadequate or misleading"
+                },
+                "scale": [0, 1],
+                "context": "This is for an educational discussion about AI ethics"
+            }
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        # Verify complex parameters were passed correctly
+        expected_call_params = {
+            "input": request.prompt,
+            "content": request.generated_text,
+            **request.detector_params
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_call_params)
+
+        # Verify response
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection_type == "llm_judge"
+
+    def test_analyze_generation_empty_params(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test analyze_generation with empty detector params (should default to safety)."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="Hello, how are you?",
+            generated_text="I'm doing well, thank you for asking! How can I assist you today?",
+            detector_params={}
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        # Should default to safety metric
+        expected_params = {
+            "input": request.prompt,
+            "content": request.generated_text,
+            "metric": "safety"
+        }
+        mock_judge.evaluate.assert_called_once_with(**expected_params)
+
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection_type == "llm_judge"
+
+    def test_generation_analysis_with_numeric_score(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation analysis handles numeric scores correctly."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        # Mock a numeric decision result
+        numeric_result = EvaluationResult(
+            decision=8.5,
+            reasoning="High quality response with good accuracy",
+            score=8.5,
+            metadata={"model": "test-model"}
+        )
+        mock_judge.evaluate.return_value = numeric_result
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="Explain photosynthesis",
+            generated_text="Photosynthesis is the process by which plants convert light energy into chemical energy.",
+            detector_params={"metric": "accuracy", "scale": [0, 10]}
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection == "8.5"
+        assert result.score == 8.5
+        assert result.detection_type == "llm_judge"
+
+    def test_generation_analysis_with_none_score(self, detector_with_mock_judge: Tuple[LLMJudgeDetector, AsyncMock]) -> None:
+        """Test generation analysis handles None scores correctly."""
+        detector: LLMJudgeDetector
+        mock_judge: AsyncMock
+        detector, mock_judge = detector_with_mock_judge
+
+        # Mock a result with None score
+        none_score_result = EvaluationResult(
+            decision="GOOD",
+            reasoning="Good quality response",
+            score=None,
+            metadata={"model": "test-model"}
+        )
+        mock_judge.evaluate.return_value = none_score_result
+
+        request = GenerationAnalysisHttpRequest(
+            prompt="Test prompt",
+            generated_text="Test generation",
+            detector_params={"metric": "helpfulness"}
+        )
+
+        result = asyncio.run(detector.analyze_generation(request))
+
+        assert isinstance(result, GenerationAnalysisResponse)
+        assert result.detection == "GOOD"
+        assert result.score == 0.0  # Should default to 0.0 when score is None
+        assert result.detection_type == "llm_judge"
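
For context, here is a minimal Pydantic-style sketch of the two schema classes this diff imports from detectors.llm_judge.scheme, reconstructed purely from the fields the tests construct and assert on. The base class, field types, and defaults are assumptions, not the actual scheme module:

# Hypothetical sketch of the new schema classes; inferred from the tests
# above, not copied from the real detectors.llm_judge.scheme module.
from typing import Any, Dict, Optional

from pydantic import BaseModel, Field


class GenerationAnalysisHttpRequest(BaseModel):
    prompt: str                                 # user prompt sent to the LLM
    generated_text: str                         # completion to be judged
    detector_params: Dict[str, Any] = Field(default_factory=dict)


class GenerationAnalysisResponse(BaseModel):
    detection: str                              # judge decision, stringified
    detection_type: str                         # always "llm_judge" in these tests
    score: float                                # 0.0 when the judge returns no score
    metadata: Optional[Dict[str, Any]] = None   # carries e.g. "reasoning"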
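
Likewise, a hedged sketch of the detector behavior the mocks encode: a default "safety" metric when neither a metric nor criteria are supplied, a (0, 1) scale default when criteria arrive without a metric, kwargs passed straight through to judge.evaluate, stringified decisions, and a 0.0 fallback for a None score. This is inferred from the test expectations and the schema sketch above, not the real LLMJudgeDetector implementation:

# Hypothetical reconstruction of the methods under test; the actual
# detector may structure this differently.
from typing import Any, Dict


class LLMJudgeDetector:
    ...

    async def evaluate_single_generation(
        self, prompt: str, generated_text: str, params: Dict[str, Any]
    ) -> GenerationAnalysisResponse:
        evaluation_params = dict(params)
        if "metric" not in evaluation_params:
            if "criteria" in evaluation_params:
                # Criteria without a metric: fall back to a normalized 0-1 scale.
                evaluation_params.setdefault("scale", (0, 1))
            else:
                # No guidance at all: default to the built-in "safety" metric.
                evaluation_params["metric"] = "safety"
        result = await self.judge.evaluate(
            input=prompt, content=generated_text, **evaluation_params
        )
        return GenerationAnalysisResponse(
            detection=str(result.decision),
            detection_type="llm_judge",
            score=result.score if result.score is not None else 0.0,
            metadata={"reasoning": result.reasoning},
        )

    async def analyze_generation(
        self, request: GenerationAnalysisHttpRequest
    ) -> GenerationAnalysisResponse:
        # Thin wrapper that unpacks the HTTP request into the evaluator.
        return await self.evaluate_single_generation(
            request.prompt, request.generated_text, request.detector_params
        )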