-
-from unittest.mock import patch, MagicMock
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = "==3.11.11"
+# dependencies = [
+#     "pandas==2.2.3",
+#     "lighteval==0.10.0",
+#     "openai==1.83.0",
+#     "spacy==3.8.7",
+#     "pytest==8.3.3",
+#     "pytest-asyncio==0.24.0",
+#     "pip"
+# ]
+# ///

import pytest
import pandas as pd
+from unittest.mock import Mock, patch, AsyncMock
+import tempfile
+import os
+import sys
+
+# Put the parent directory on sys.path so evaluation.evals can be imported when this file runs standalone
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from evaluation.evals import evaluate_response, calculate_cost_metrics, load_csv
+

-from evals import evaluate_response
+@pytest.fixture
+def mock_token_usage():
+    token_usage = Mock()
+    token_usage.input_tokens = 100
+    token_usage.output_tokens = 50
+    return token_usage

-class MockTokenUsage:
-    def __init__(self, input_tokens, output_tokens):
-        self.input_tokens = input_tokens
-        self.output_tokens = output_tokens

-@patch("evals.ModelFactory.get_handler")
-@patch("evals.Extractiveness.compute")
-def test_evaluate_response(mock_extractiveness_compute, mock_get_handler):
+@pytest.fixture
+def temp_csv():
+    def _create_csv(content):
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+            f.write(content)
+        return f.name

-    # Mock BaseModelHandler
-    mock_handler = MagicMock()
-    mock_handler.handle_request.return_value = (
-        "This is a summary.",
-        MockTokenUsage(input_tokens=100, output_tokens=50),
-        {"input": 15.0, "output": 30.0},  # $15 and $30 per 1M tokens
-        1.23,  # duration
+    return _create_csv
+
+
+class TestCalculateCostMetrics:
+    @pytest.mark.parametrize(
+        "input_tokens,output_tokens,input_price,output_price,expected_input,expected_output,expected_total",
+        [
+            (1000, 500, 5.0, 15.0, 0.005, 0.0075, 0.0125),
+            (0, 0, 5.0, 15.0, 0.0, 0.0, 0.0),
+            (1_000_000, 2_000_000, 10.0, 30.0, 10.0, 60.0, 70.0),
+        ],
    )
+    def test_calculate_cost_metrics(
+        self,
+        input_tokens,
+        output_tokens,
+        input_price,
+        output_price,
+        expected_input,
+        expected_output,
+        expected_total,
+    ):
+        token_usage = Mock(input_tokens=input_tokens, output_tokens=output_tokens)
+        pricing = {"input": input_price, "output": output_price}
+
+        result = calculate_cost_metrics(token_usage, pricing)

-    mock_get_handler.return_value = mock_handler
+        assert pytest.approx(result["input_cost"]) == expected_input
+        assert pytest.approx(result["output_cost"]) == expected_output
+        assert pytest.approx(result["total_cost"]) == expected_total

-    mock_extractiveness_compute.return_value = {
-        "summarization_coverage": 0.8,
-        "summarization_density": 1.5,
-        "summarization_compression": 2.0,
-    }

-    df = evaluate_response(
-        model_name="mock-model",
-        query="What is the summary?",
-        context="This is a long article about something important.",
-        reference="This is a reference summary.",
+class TestLoadCsv:
+    @pytest.mark.parametrize(
+        "csv_content,required_columns,expected_len",
+        [
+            (
+                "model,instructions\ngpt-4,Test prompt\ngpt-3.5,Another prompt\n",
+                ["MODEL", "INSTRUCTIONS"],
+                2,
+            ),
+            (
+                " model , instructions \ngpt-4,Test prompt\n",
+                ["MODEL", "INSTRUCTIONS"],
+                1,
+            ),
+            ("input\nTest input 1\nTest input 2\n", ["INPUT"], 2),
+        ],
    )
+    def test_load_csv_valid(
+        self, temp_csv, csv_content, required_columns, expected_len
+    ):
+        temp_path = temp_csv(csv_content)
+        try:
+            df = load_csv(temp_path, required_columns)
+            assert len(df) == expected_len
+            assert list(df.columns) == required_columns
+        finally:
+            os.unlink(temp_path)
+
+    @pytest.mark.parametrize(
+        "csv_content,required_columns",
+        [
+            ("model,prompt\ngpt-4,Test prompt\n", ["MODEL", "INSTRUCTIONS"]),
+            ("wrong,columns\nval1,val2\n", ["MODEL", "INSTRUCTIONS"]),
+        ],
+    )
+    def test_load_csv_missing_columns(self, temp_csv, csv_content, required_columns):
+        temp_path = temp_csv(csv_content)
+        try:
+            with pytest.raises(ValueError, match="must contain the following columns"):
+                load_csv(temp_path, required_columns)
+        finally:
+            os.unlink(temp_path)
+
+    def test_load_csv_nonexistent_file(self):
+        with pytest.raises(FileNotFoundError):
+            load_csv("nonexistent_file.csv", ["MODEL"])
+
+
+class TestEvaluateResponse:
+    @pytest.mark.asyncio
+    async def test_evaluate_response_success(self, mock_token_usage):
+        mock_handler = AsyncMock()
+        mock_handler.handle_request.return_value = (
+            "Generated response text",
+            mock_token_usage,
+            {"input": 5.0, "output": 15.0},
+            1.5,
+        )
+
+        mock_extractiveness = Mock()
+        mock_extractiveness.compute.return_value = {
+            "summarization_coverage": 0.8,
+            "summarization_density": 0.6,
+            "summarization_compression": 0.4,
+        }
+
+        with (
+            patch(
+                "evaluation.evals.ModelFactory.get_handler", return_value=mock_handler
+            ),
+            patch("evaluation.evals.Extractiveness", return_value=mock_extractiveness),
+        ):
+            result = await evaluate_response("gpt-4", "Test instructions", "Test input")
+
+        assert isinstance(result, pd.DataFrame)
+        assert len(result) == 1
+        row = result.iloc[0]
+        assert row["Generated Text"] == "Generated response text"
+        assert row["Extractiveness Coverage"] == 0.8
+        assert row["Input Token Usage"] == 100
+        assert row["Output Token Usage"] == 50
+        assert row["Duration (s)"] == 1.5
+
+    @pytest.mark.parametrize(
+        "exception_side_effect", ["get_handler", "handle_request", "extractiveness"]
+    )
+    @pytest.mark.asyncio
+    async def test_evaluate_response_exceptions(
+        self, mock_token_usage, exception_side_effect
+    ):
+        if exception_side_effect == "get_handler":
+            with patch(
+                "evaluation.evals.ModelFactory.get_handler",
+                side_effect=Exception("Test error"),
+            ):
+                result = await evaluate_response(
+                    "invalid-model", "Test instructions", "Test input"
+                )
+
+        elif exception_side_effect == "handle_request":
+            mock_handler = AsyncMock()
+            mock_handler.handle_request.side_effect = Exception("Handler error")
+            with patch(
+                "evaluation.evals.ModelFactory.get_handler", return_value=mock_handler
+            ):
+                result = await evaluate_response(
+                    "gpt-4", "Test instructions", "Test input"
+                )
+
+        elif exception_side_effect == "extractiveness":
+            mock_handler = AsyncMock()
+            mock_handler.handle_request.return_value = (
+                "text",
+                mock_token_usage,
+                {"input": 5.0, "output": 15.0},
+                1.5,
+            )
+            mock_extractiveness = Mock()
+            mock_extractiveness.compute.side_effect = Exception("Extractiveness error")
+
+            with (
+                patch(
+                    "evaluation.evals.ModelFactory.get_handler",
+                    return_value=mock_handler,
+                ),
+                patch(
+                    "evaluation.evals.Extractiveness", return_value=mock_extractiveness
+                ),
+            ):
+                result = await evaluate_response(
+                    "gpt-4", "Test instructions", "Test input"
+                )
+
+        assert isinstance(result, pd.DataFrame)
+        assert len(result) == 1
+        assert pd.isna(result.iloc[0]["Generated Text"])
+

-    assert isinstance(df, pd.DataFrame)
-    assert df.shape == (1, 8)
-    assert df["Output Text"].iloc[0] == "This is a summary."
-    assert df["Extractiveness Coverage"].iloc[0] == 0.8
-    assert df["Extractiveness Density"].iloc[0] == 1.5
-    assert df["Extractiveness Compression"].iloc[0] == 2.0
-    assert df["Input Token Usage"].iloc[0] == 100
-    assert df["Output Token Usage"].iloc[0] == 50
-
-    expected_cost = (15.0 / 1_000_000) * 100 + (30.0 / 1_000_000) * 50
-    assert pytest.approx(df["Cost (USD)"].iloc[0], rel=1e-4) == expected_cost
-    assert pytest.approx(df["Duration (s)"].iloc[0], rel=1e-4) == 1.23
+if __name__ == "__main__":
+    pytest.main([__file__])
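
Note: with the inline `# /// script` metadata and the `__main__` guard added above, this test module can also be run standalone via uv (for example `uv run path/to/this_test_file.py`, which resolves the pinned dependencies into an ephemeral environment) in addition to a normal `pytest` invocation; the file's actual path is not part of this diff, so the path shown here is illustrative.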