1- import asyncio
21import ast
3- import pandas as pd
4- import nbformat
2+ import asyncio
53import json
4+ import operator
65
7- from fhda . utils import view_notebook
8- import postprocessing_utils as utils
6+ import nbformat
7+ import pandas as pd
98import plotting_utils
9+ import postprocessing_utils as utils
10+ from fhda .utils import view_notebook
1011
1112pd .options .mode .chained_assignment = None
1213
14+
1315def load_raw_data (path : str ):
1416 """
1517 Load raw data from a CSV file and process specific columns.
16-
18+
1719 Args:
1820 path (str): Path to the CSV file containing raw data
19-
21+
2022 Returns:
2123 pd.DataFrame: Processed DataFrame with converted column types
2224 """
@@ -37,7 +39,7 @@ def load_raw_data(path: str):
3739 df [col ] = df [col ].apply (func )
3840
3941 # Convert json notebook to markdown for postprocessing
40- if "nb" in df .columns and not "nb_md" in df .columns :
42+ if "nb" in df .columns and "nb_md" not in df .columns :
4143 df_md = pd .DataFrame (
4244 df ["nb" ].apply (lambda x : view_notebook (x .cells , "python" )).tolist (),
4345 columns = ["md_notebook" , "md_images" ],
@@ -50,10 +52,10 @@ def load_raw_data(path: str):
5052async def process_trajectories (df : pd .DataFrame ):
5153 """
5254 Create a gradable dataframe from a raw dataframe of trajectories.
53-
55+
5456 This function processes the raw data, runs evaluation loops, and saves
5557 the results to CSV files for further analysis.
56-
58+
5759 Args:
5860 df (pd.DataFrame): Raw data containing model trajectories
5961 """
@@ -67,7 +69,7 @@ async def process_trajectories(df: pd.DataFrame):
6769 # Create correct column for open ended questions
6870 eval_df .loc [eval_df .question_format == "open" , "correct" ] = eval_df .loc [
6971 eval_df .question_format == "open" , "llm_answer"
70- ].apply (lambda x : True if x == "1" else False )
72+ ].apply (lambda x : x == "1" )
7173 # Extract XML from LLM MCQ answers
7274 eval_df .loc [eval_df .question_format == "mcq" , "llm_answer" ] = eval_df .loc [
7375 eval_df .question_format == "mcq" , "llm_answer"
@@ -85,7 +87,7 @@ async def process_trajectories(df: pd.DataFrame):
8587async def run_majority_vote ():
8688 """
8789 Implement majority voting evaluation across different model configurations.
88-
90+
8991 This function reads evaluation data, performs majority voting analysis for
9092 multiple choice questions, and produces visualization comparing different model
9193 configurations with and without specific features.
@@ -106,7 +108,7 @@ async def run_majority_vote():
106108 grouped_df ["llm_answer" ] = grouped_df ["llm_answer" ].fillna ("X" )
107109 grouped_df = grouped_df .groupby ("uuid" ).agg (list )
108110 grouped_df ["correct_letter" ] = grouped_df ["correct_letter" ].apply (
109- lambda x : x [ 0 ]
111+ operator . itemgetter ( 0 )
110112 )
111113 grouped_df = grouped_df .dropna ()
112114 k_values , means , stds = utils .run_majority_voting (
@@ -143,7 +145,7 @@ async def run_majority_vote():
143145async def compare_capsule_mode ():
144146 """
145147 Compare performance between different model architectures.
146-
148+
147149 This function analyzes and visualizes the performance differences between
148150 GPT-4o and Claude models across different question formats.
149151 """
@@ -175,10 +177,10 @@ async def compare_capsule_mode():
175177def calculate_results (df ):
176178 """
177179 Calculate means and confidence intervals for each model and format.
178-
180+
179181 Args:
180182 df (pd.DataFrame): DataFrame containing model evaluation results
181-
183+
182184 Returns:
183185 list: List of dictionaries containing statistical results for each model and format
184186 """
@@ -206,7 +208,7 @@ def calculate_results(df):
206208async def compare_capsule_mode_with_refusal ():
207209 """
208210 Compare models with refusal mode enabled.
209-
211+
210212 This function loads evaluation data, processes it to compare how different models
211213 perform when the refusal option is available, and visualizes the results.
212214 """
@@ -219,10 +221,14 @@ async def compare_capsule_mode_with_refusal():
219221
220222 # Filter to include only runs with refusal option enabled
221223 tmp = tmp [tmp .run_name .str .contains ("with_refusal" )]
222-
224+
223225 tmp ["model" ] = tmp ["run_name" ].apply (lambda x : model1 if "4o" in x else model2 )
224- tmp ["vision" ] = tmp ["run_name" ].apply (lambda x : "With Vision" if "image" in x and "no_image" not in x else "Without Vision" )
225-
226+ tmp ["vision" ] = tmp ["run_name" ].apply (
227+ lambda x : (
228+ "With Vision" if "image" in x and "no_image" not in x else "Without Vision"
229+ )
230+ )
231+
226232 # Calculate means and confidence intervals
227233 results = calculate_results_for_refusal (tmp )
228234 print (results )
@@ -234,10 +240,10 @@ async def compare_capsule_mode_with_refusal():
234240def calculate_results_for_refusal (df ):
235241 """
236242 Calculate means and confidence intervals for refusal mode comparison.
237-
243+
238244 Args:
239245 df (pd.DataFrame): DataFrame containing model evaluation results
240-
246+
241247 Returns:
242248 list: List of dictionaries containing statistical results for each model and vision mode
243249 """
@@ -268,4 +274,4 @@ def calculate_results_for_refusal(df):
268274 asyncio .run (process_trajectories (data ))
269275 asyncio .run (run_majority_vote ())
270276 asyncio .run (compare_capsule_mode ())
271- asyncio .run (compare_capsule_mode_with_refusal ())
277+ asyncio .run (compare_capsule_mode_with_refusal ())
0 commit comments