# ... (top of file elided; asyncio, json, argparse and the typing names
# Dict/Any are used below, so they are presumably imported here) ...

import os
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

# ... (elided: further setup, including the OUTPUT_EXCEL_FILE constant used below) ...


async def call_gpt4o_mini_async(client: OpenAI, prompt: str) -> str:
    messages = [{"role": "user", "content": prompt}]

    try:
        # Run the synchronous OpenAI client call on a worker thread so it
        # does not block the event loop.
        response = await asyncio.to_thread(
            client.chat.completions.create,
            model="gpt-4o-mini",
            messages=messages,
            temperature=0,
            max_tokens=500,
            timeout=30.0,
        )
        return response.choices[0].message.content
    except Exception as e:
        # ... (error handling elided) ...
# ... (remainder of the error handling elided) ...


def parse_xml_response(response: str, tag: str) -> str:
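    # The original body is elided in this excerpt. A minimal sketch of what it
    # presumably does -- pull the text between <tag> and </tag> out of the
    # model reply (assumes `re` is imported in the elided header; not the
    # verbatim original):
    match = re.search(rf"<{tag}>(.*?)</{tag}>", response, re.DOTALL)
    return match.group(1).strip() if match else ""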


async def evaluate_violate_preference_async(
    client: OpenAI, preference: str, question: str, response: str
) -> Dict[str, str]:
    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant's response violates the user's stated preference.
Evaluate the response based on these criteria:
... (criteria and the conversation transcript elided) ...
<explanation>[1 very short sentence explanation]</explanation>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(client, prompt)
    return {
        "explanation": parse_xml_response(api_response, "explanation"),
        "answer": parse_xml_response(api_response, "answer"),
    }
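
# The evaluators that follow repeat this exact shape: build a prompt that
# demands a fixed XML answer, call gpt-4o-mini, then parse the tagged fields.
# A generic helper could factor that out -- a sketch, not part of the original:
#
#     async def evaluate(client: OpenAI, prompt: str, tags: list[str]) -> Dict[str, str]:
#         api_response = await call_gpt4o_mini_async(client, prompt)
#         return {tag: parse_xml_response(api_response, tag) for tag in tags}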


async def evaluate_acknowledge_preference_async(
    client: OpenAI, question: str, response: str
) -> Dict[str, str]:
    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant acknowledges any user preference in answering the user's query.

... (evaluation criteria and the conversation transcript elided) ...
Examine the response meticulously and answer. Please answer in this exact XML format without any additional text:
<preference>[quote of the sentence that acknowledges/mentions what the preference is; leave it blank if there is none]</preference>
<answer>[Yes/No]</answer>"""
    api_response = await call_gpt4o_mini_async(client, prompt)
    return {
        "preference_mention": parse_xml_response(api_response, "preference"),
        "answer": parse_xml_response(api_response, "answer"),
    }


async def evaluate_hallucinate_preference_async(
    client: OpenAI, preference: str, restatement: str
) -> Dict[str, str]:
    if not restatement.strip():
        return {"explanation": "No restatement provided by assistant", "answer": "No"}

    prompt = f"""... (instructions comparing the stated preference with the assistant's restatement elided) ...
<explanation>[1 short sentence explanation]</explanation>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(client, prompt)
    return {
        "explanation": parse_xml_response(api_response, "explanation"),
        "answer": parse_xml_response(api_response, "answer"),
    }


async def evaluate_helpful_response_async(
    client: OpenAI, question: str, response: str
) -> Dict[str, str]:
    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant provides any substantive response to the user's query.
Evaluate the response based on these stringent criteria:
... (criteria and the conversation transcript elided) ...
<explanation>[1 very short sentence explanation]</explanation>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(client, prompt)
    return {
        "explanation": parse_xml_response(api_response, "explanation"),
        "answer": parse_xml_response(api_response, "answer"),
    }


def classify_error_type(evaluation_results: Dict[str, Any]) -> str:
    # ... (the preceding error checks are elided; this is the fall-through case) ...
    return "Personalized Response"


async def process_line(line: str, client: OpenAI, semaphore: asyncio.Semaphore) -> Dict[str, Any]:
    async with semaphore:
        data = json.loads(line.strip())
        preference = data["preference"]
        # ... (evaluation calls and result assembly elided) ...
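
# How process_line is presumably driven: a bounded fan-out where the semaphore
# caps in-flight requests. The actual read-and-gather loop is elided from this
# excerpt; this sketch assumes a JSONL input handle named `f`:
#
#     semaphore = asyncio.Semaphore(concurrency_limit)
#     tasks = [process_line(line, client, semaphore) for line in f]
#     results = await asyncio.gather(*tasks)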


# ... (elided) ...


def generate_excel_summary(
    # ... (leading parameters elided) ...
    avg_search_time: float,
    avg_context_tokens: float,
    avg_add_time: float,
    model_name: str = "gpt-4o-mini",
):
    print(f"Generating Excel summary at {OUTPUT_EXCEL_FILE}...")

    # ... (def get_pct(key) and the opening of the summary `data` dict elided) ...
        # Chinese column labels kept verbatim; glosses: 个性化回答 = "personalized
        # response", 添加 = "add", 搜索 = "search".
        "Personalized Response\n个性化回答": [personalized_pct / 100],
        "context token": [avg_context_tokens],
        "Time添加": [f"{avg_add_time:.2f}s"],
        "Time搜索": [f"{avg_search_time:.2f}s"],
    }

    df = pd.DataFrame(data)
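
    # The rest of generate_excel_summary is elided; presumably the one-row
    # frame is then written out, e.g. (assumption, not the verbatim original):
    #
    #     df.to_excel(OUTPUT_EXCEL_FILE, index=False)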


# ... (elided) ...


async def main(concurrency_limit: int):
    # ... (setup and the per-result metrics loop elided; search_time is read
    # from the same metrics dict as the values below) ...
        context_tokens = metrics.get("memory_tokens_used")
        add_time = metrics.get("add_memories_duration_seconds")

        all_metrics_valid = (
            search_time is not None and add_time is not None and context_tokens is not None
        )

        if all_metrics_valid:
            total_search_time += float(search_time)
            # ... (remaining accumulation elided) ...

    avg_search_time = (total_search_time / valid_metric_samples) if valid_metric_samples > 0 else 0
    avg_add_time = (total_add_time / valid_metric_samples) if valid_metric_samples > 0 else 0
    avg_context_tokens = (
        (total_context_tokens / valid_metric_samples) if valid_metric_samples > 0 else 0
    )

    try:
        generate_excel_summary(
            # ... (arguments to generate_excel_summary and error handling elided) ...

# ... (elided: argparse setup; the option that feeds args.concurrency_limit is
# added in a parser.add_argument call that closes here) ...
)
args = parser.parse_args()

asyncio.run(main(concurrency_limit=args.concurrency_limit))
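
# Example invocation -- the flag name is inferred from args.concurrency_limit
# and the script name is invented; the real add_argument call is elided:
#
#     python evaluate.py --concurrency_limit 8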